Skip to content

Commit

Permalink
Begin adding an object cache for shader modules.
Browse files Browse the repository at this point in the history
  • Loading branch information
HansKristian-Work committed Jul 4, 2019
1 parent 7639c28 commit a536eb9
Show file tree
Hide file tree
Showing 11 changed files with 606 additions and 25 deletions.
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ add_library(fossilize STATIC
fossilize_types.hpp
varint.cpp varint.hpp
fossilize_db.cpp fossilize_db.hpp
util/intrusive_list.hpp util/object_pool.hpp util/object_cache.hpp
path.hpp path.cpp)
set_target_properties(fossilize PROPERTIES POSITION_INDEPENDENT_CODE ON)

Expand Down
111 changes: 87 additions & 24 deletions cli/fossilize_replay.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
#include "fossilize_external_replayer.hpp"
#include "fossilize_external_replayer_control_block.hpp"
#include "fossilize_errors.hpp"
#include "util/object_cache.hpp"

#include <cinttypes>
#include <string>
Expand Down Expand Up @@ -213,6 +214,8 @@ struct ThreadedReplayer : StateCreatorInterface
// VALVE: --loop option for testing performance
unsigned loop_count = 1;

unsigned shader_cache_size_mb = 256;

// Carve out a range of which pipelines to replay.
// Used for multi-process replays where each process gets its own slice to churn through.
unsigned start_graphics_index = 0;
Expand Down Expand Up @@ -311,6 +314,15 @@ struct ThreadedReplayer : StateCreatorInterface
shader_module_total_compressed_size.store(0);
shader_module_total_size.store(0);
per_thread_data.resize(num_worker_threads + 1);

// Could potentially overflow on 32-bit.
size_t target_size;
if (opts.shader_cache_size_mb <= (SIZE_MAX / (1024 * 1024)))
target_size = size_t(opts.shader_cache_size_mb) * 1024 * 1024;
else
target_size = SIZE_MAX;

shader_modules.set_target_size(target_size);
}

PerThreadData &get_per_thread_data()
Expand Down Expand Up @@ -714,9 +726,6 @@ struct ThreadedReplayer : StateCreatorInterface
for (auto &pipeline_layout : pipeline_layouts)
if (pipeline_layout.second)
vkDestroyPipelineLayout(device->get_device(), pipeline_layout.second, nullptr);
for (auto &shader_module : shader_modules)
if (shader_module.second)
vkDestroyShaderModule(device->get_device(), shader_module.second, nullptr);
for (auto &render_pass : render_passes)
if (render_pass.second)
vkDestroyRenderPass(device->get_device(), render_pass.second, nullptr);
Expand All @@ -726,6 +735,11 @@ struct ThreadedReplayer : StateCreatorInterface
for (auto &pipeline : graphics_pipelines)
if (pipeline.second)
vkDestroyPipeline(device->get_device(), pipeline.second, nullptr);

shader_modules.delete_cache([this](Hash, VkShaderModule module) {
if (module != VK_NULL_HANDLE)
vkDestroyShaderModule(device->get_device(), module, nullptr);
});
}

bool validate_pipeline_cache_header(const vector<uint8_t> &blob)
Expand Down Expand Up @@ -917,11 +931,12 @@ struct ThreadedReplayer : StateCreatorInterface

bool enqueue_create_shader_module(Hash hash, const VkShaderModuleCreateInfo *create_info, VkShaderModule *module) override
{
*module = VK_NULL_HANDLE;
if (masked_shader_modules.count(hash))
{
*module = VK_NULL_HANDLE;
lock_guard<mutex> lock(internal_enqueue_mutex);
shader_modules[hash] = VK_NULL_HANDLE;
//LOGI("Inserting shader module %016llx.\n", static_cast<unsigned long long>(hash));
shader_modules.insert_object(hash, *module, 1);
return true;
}

Expand All @@ -945,7 +960,8 @@ struct ThreadedReplayer : StateCreatorInterface
LOGE("Failed to validate SPIR-V module: %0" PRIX64 "\n", hash);
*module = VK_NULL_HANDLE;
lock_guard<mutex> lock(internal_enqueue_mutex);
shader_modules[hash] = VK_NULL_HANDLE;
//LOGI("Inserting shader module %016llx.\n", static_cast<unsigned long long>(hash));
shader_modules.insert_object(hash, VK_NULL_HANDLE, 1);
shader_module_count.fetch_add(1, std::memory_order_relaxed);

if (opts.control_block)
Expand All @@ -956,18 +972,12 @@ struct ThreadedReplayer : StateCreatorInterface
}
#endif

VkShaderModule *hash_map_entry;
{
lock_guard<mutex> lock(internal_enqueue_mutex);
hash_map_entry = &shader_modules[hash];
}

for (unsigned i = 0; i < loop_count; i++)
{
// Avoid leak.
if (*hash_map_entry != VK_NULL_HANDLE)
vkDestroyShaderModule(device->get_device(), *hash_map_entry, nullptr);
*hash_map_entry = VK_NULL_HANDLE;
if (*module != VK_NULL_HANDLE)
vkDestroyShaderModule(device->get_device(), *module, nullptr);
*module = VK_NULL_HANDLE;

auto start_time = chrono::steady_clock::now();
if (vkCreateShaderModule(device->get_device(), create_info, nullptr, module) == VK_SUCCESS)
Expand All @@ -976,7 +986,6 @@ struct ThreadedReplayer : StateCreatorInterface
auto duration_ns = chrono::duration_cast<chrono::nanoseconds>(end_time - start_time).count();
shader_module_ns.fetch_add(duration_ns, std::memory_order_relaxed);
shader_module_count.fetch_add(1, std::memory_order_relaxed);
*hash_map_entry = *module;

if (robustness)
{
Expand All @@ -994,6 +1003,12 @@ struct ThreadedReplayer : StateCreatorInterface
}
}

{
lock_guard<mutex> lock(internal_enqueue_mutex);
//LOGI("Inserting shader module %016llx.\n", static_cast<unsigned long long>(hash));
shader_modules.insert_object(hash, *module, create_info->codeSize);
}

return true;
}

Expand Down Expand Up @@ -1167,10 +1182,14 @@ struct ThreadedReplayer : StateCreatorInterface
work_item.memory_context_index = SHADER_MODULE_MEMORY_CONTEXT;
enqueue_work_item(work_item);
enqueued_shader_modules.insert(shader_module_hash);
//LOGI("Queueing up shader module: %016llx.\n", static_cast<unsigned long long>((Hash) shader_module_hash));
return true;
}
else
{
//LOGI("Not queueing up shader module: %016llx.\n", static_cast<unsigned long long>((Hash) shader_module_hash));
return false;
}
}

bool enqueue_shader_modules(const VkGraphicsPipelineCreateInfo *info)
Expand All @@ -1192,15 +1211,25 @@ struct ThreadedReplayer : StateCreatorInterface
{
for (uint32_t i = 0; i < info->stageCount; i++)
{
const_cast<VkPipelineShaderStageCreateInfo *>(info->pStages)[i].module =
shader_modules[(Hash) info->pStages[i].module];
auto result = shader_modules.find_object((Hash) info->pStages[i].module);
if (!result.second)
{
LOGE("Could not find shader module %016llx in cache.\n",
static_cast<unsigned long long>((Hash) info->pStages[i].module));
}
const_cast<VkPipelineShaderStageCreateInfo *>(info->pStages)[i].module = result.first;
}
}

// Replaces the shader module hash carried in a compute pipeline's stage with the
// VkShaderModule object stored for that hash in the shader module cache.
//
// info: compute pipeline create info whose stage.module field currently holds a
//       Hash value disguised as a handle. It is modified in place.
//
// If the hash is not present in the cache an error is logged, and whatever
// find_object() returned as its first element is still written to the stage
// (presumably a null handle - confirm against ObjectCache::find_object).
void resolve_shader_modules(VkComputePipelineCreateInfo *info)
{
	// Read the hash exactly once, before the field is overwritten below.
	// Looking it up again after the write would use a real handle as a hash.
	// The C-style cast is kept on purpose: the source already uses it to turn the handle field into a Hash.
	const Hash module_hash = (Hash) info->stage.module;

	auto result = shader_modules.find_object(module_hash);
	if (!result.second)
	{
		LOGE("Could not find shader module %016llx in cache.\n",
		     static_cast<unsigned long long>(module_hash));
	}

	// info is already a non-const pointer, so no const_cast is needed here.
	info->stage.module = result.first;
}

template <typename DerivedInfo>
Expand Down Expand Up @@ -1251,7 +1280,8 @@ struct ThreadedReplayer : StateCreatorInterface
ENQUEUE_OUT_OF_RANGE_PARENT_PIPELINES = 3,
ENQUEUE_SHADER_MODULE_SECONDARY_OFFSET = 4,
ENQUEUE_DERIVED_PIPELINES_OFFSET = 5,
PASS_COUNT = 6
MAINTAIN_SHADER_MODULE_LRU_CACHE = 6,
PASS_COUNT = 7
};

auto outside_range_hashes = make_shared<unordered_set<Hash>>();
Expand All @@ -1267,6 +1297,7 @@ struct ThreadedReplayer : StateCreatorInterface
// - ...
// - Pass 6, chunk 0 (reclaim memory for context 0)
// - Pass 6, chunk 1 (reclaim memory for context 1)
// - Pass 7, sync point
// - Pass 0, chunk 2 (context 0)
// - Pass 1, chunk 3 (context 1)

Expand Down Expand Up @@ -1506,6 +1537,32 @@ struct ThreadedReplayer : StateCreatorInterface
}
}});

// Queue the shader module cache maintenance pass only on iterations where memory_index is 0.
// memory_index advances modulo NUM_PIPELINE_MEMORY_CONTEXTS after each iteration, so this
// happens once per full cycle through the pipeline memory contexts.
if (memory_index == 0)
{
work.push_back({ get_order_index(MAINTAIN_SHADER_MODULE_LRU_CACHE),
[this]() {
// Synchronize every pipeline memory context before touching the shared cache state.
for (unsigned i = 0; i < NUM_PIPELINE_MEMORY_CONTEXTS; i++)
sync_worker_memory_context(i);

// Now all worker threads are drained, so we can maintain the shader module LRU cache.
// The callback below is handed to prune_cache(); presumably it is invoked once per evicted
// entry (confirm against ObjectCache::prune_cache). For each entry it receives, it drops the
// hash from enqueued_shader_modules so the module can be enqueued again later, and destroys
// the Vulkan object unless the cached handle is VK_NULL_HANDLE.
shader_modules.prune_cache([this](Hash hash, VkShaderModule module) {
// Every cached module is expected to have been recorded as enqueued beforehand.
assert(enqueued_shader_modules.count((VkShaderModule) hash) != 0);
//LOGI("Removing shader module %016llx.\n", static_cast<unsigned long long>(hash));
enqueued_shader_modules.erase((VkShaderModule) hash);
if (module != VK_NULL_HANDLE)
vkDestroyShaderModule(device->get_device(), module, nullptr);
});

// Need to forget that we have seen an object before so we can replay the same object multiple times.
// Only threads that actually own a replayer array are touched, and only the replayer
// belonging to the shader module memory context is reset.
for (auto &per_thread : per_thread_data)
if (per_thread.per_thread_replayers)
per_thread.per_thread_replayers[SHADER_MODULE_MEMORY_CONTEXT].forget_handle_references();

// NOTE(review): these asserts require prune_cache() to have evicted every entry, not just
// trimmed the cache down to its target size - confirm this matches the ObjectCache semantics.
// They are compiled out when NDEBUG is defined.
assert(enqueued_shader_modules.empty());
assert(shader_modules.get_current_object_count() == 0);
}});
}

memory_index = (memory_index + 1) % NUM_PIPELINE_MEMORY_CONTEXTS;
}
}
Expand Down Expand Up @@ -1540,7 +1597,9 @@ struct ThreadedReplayer : StateCreatorInterface
std::unordered_map<Hash, VkSampler> samplers;
std::unordered_map<Hash, VkDescriptorSetLayout> layouts;
std::unordered_map<Hash, VkPipelineLayout> pipeline_layouts;
std::unordered_map<Hash, VkShaderModule> shader_modules;

ObjectCache<VkShaderModule> shader_modules;

std::unordered_map<Hash, VkRenderPass> render_passes;
std::unordered_map<Hash, VkPipeline> compute_pipelines;
std::unordered_map<Hash, VkPipeline> graphics_pipelines;
Expand Down Expand Up @@ -1649,6 +1708,7 @@ static void print_help()
"\t[--on-disk-pipeline-cache <path>]\n"
"\t[--graphics-pipeline-range <start> <end>]\n"
"\t[--compute-pipeline-range <start> <end>]\n"
"\t[--shader-cache-size <value (MiB)>]\n"
EXTRA_OPTIONS
"\t<Database>\n");
}
Expand Down Expand Up @@ -1966,6 +2026,7 @@ static int run_normal_process(ThreadedReplayer &replayer, const vector<const cha
compute_start_index = start_index;
}

assert(hashes);
hashes->resize(resource_hash_count);

if (!resolver->get_hash_list_for_resource_tag(tag, &resource_hash_count, hashes->data()))
Expand Down Expand Up @@ -2033,7 +2094,7 @@ static int run_normal_process(ThreadedReplayer &replayer, const vector<const cha
replayer.samplers.size() +
replayer.layouts.size() +
replayer.pipeline_layouts.size() +
replayer.shader_modules.size() +
replayer.shader_modules.get_current_object_count() +
replayer.render_passes.size() +
replayer.compute_pipelines.size() +
replayer.graphics_pipelines.size();
Expand Down Expand Up @@ -2070,7 +2131,7 @@ static int run_normal_process(ThreadedReplayer &replayer, const vector<const cha
LOGI(" samplers: %7lu\n", (unsigned long)replayer.samplers.size());
LOGI(" descriptor set layouts:%7lu\n", (unsigned long)replayer.layouts.size());
LOGI(" pipeline layouts: %7lu\n", (unsigned long)replayer.pipeline_layouts.size());
LOGI(" shader modules: %7lu\n", (unsigned long)replayer.shader_modules.size());
LOGI(" shader modules: %7lu\n", (unsigned long)replayer.shader_modules.get_current_object_count());
LOGI(" render passes: %7lu\n", (unsigned long)replayer.render_passes.size());
LOGI(" compute pipelines: %7lu\n", (unsigned long)replayer.compute_pipelines.size());
LOGI(" graphics pipelines: %7lu\n", (unsigned long)replayer.graphics_pipelines.size());
Expand Down Expand Up @@ -2145,6 +2206,8 @@ int main(int argc, char *argv[])
#endif
#endif

cbs.add("--shader-cache-size", [&](CLIParser &parser) { replayer_opts.shader_cache_size_mb = parser.next_uint(); });

cbs.error_handler = [] { print_help(); };

CLIParser parser(move(cbs), argc - 1, argv + 1);
Expand Down
3 changes: 3 additions & 0 deletions cli/fossilize_replay_linux.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -362,6 +362,9 @@ static int run_master_process(const VulkanDevice::Options &opts,
Global::base_replayer_options = replayer_opts;
Global::databases = databases;
unsigned processes = replayer_opts.num_threads;

// Split shader cache overhead across all processes.
Global::base_replayer_options.shader_cache_size_mb /= max(Global::base_replayer_options.num_threads, 1u);
Global::base_replayer_options.num_threads = 1;

// Try to map the shared control block.
Expand Down
8 changes: 7 additions & 1 deletion cli/fossilize_replay_windows.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -386,6 +386,9 @@ bool ProcessProgress::start_child_process()
// We're supposed to populate the driver caches here first and foremost.
}

// Forward the shader cache budget to the child process. The value is expressed in MiB,
// matching what the --shader-cache-size option parses into shader_cache_size_mb.
// The options struct has no member named shader_cache_size, so the _mb field must be used.
cmdline += " --shader-cache-size ";
cmdline += std::to_string(Global::base_replayer_options.shader_cache_size_mb);

// Create custom named pipes which can be inherited by our child processes.
SECURITY_ATTRIBUTES attrs = {};
attrs.bInheritHandle = TRUE;
Expand Down Expand Up @@ -566,10 +569,13 @@ static int run_master_process(const VulkanDevice::Options &opts,
Global::base_replayer_options = replayer_opts;
Global::databases = databases;
unsigned processes = replayer_opts.num_threads;
Global::base_replayer_options.num_threads = 1;
Global::shm_name = shm_name;
Global::shm_mutex_name = shm_mutex_name;

// Split shader cache overhead across all processes.
Global::base_replayer_options.shader_cache_size_mb /= max(Global::base_replayer_options.num_threads, 1u);
Global::base_replayer_options.num_threads = 1;

Global::job_handle = CreateJobObjectA(nullptr, nullptr);
if (!Global::job_handle)
{
Expand Down
17 changes: 17 additions & 0 deletions fossilize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,7 @@ struct StateReplayer::Impl
std::unordered_map<Hash, VkPipeline> replayed_graphics_pipelines;

void copy_handle_references(const Impl &impl);
void forget_handle_references();
bool parse_samplers(StateCreatorInterface &iface, const Value &samplers) FOSSILIZE_WARN_UNUSED;
bool parse_descriptor_set_layouts(StateCreatorInterface &iface, const Value &layouts) FOSSILIZE_WARN_UNUSED;
bool parse_pipeline_layouts(StateCreatorInterface &iface, const Value &layouts) FOSSILIZE_WARN_UNUSED;
Expand Down Expand Up @@ -2488,6 +2489,11 @@ void StateReplayer::copy_handle_references(const StateReplayer &replayer)
impl->copy_handle_references(*replayer.impl);
}

// Public entry point: makes this replayer drop all of its recorded handle references
// by delegating to the implementation object.
void StateReplayer::forget_handle_references()
{
	auto &state = *impl;
	state.forget_handle_references();
}

void StateReplayer::Impl::copy_handle_references(const StateReplayer::Impl &other)
{
replayed_samplers = other.replayed_samplers;
Expand All @@ -2499,6 +2505,17 @@ void StateReplayer::Impl::copy_handle_references(const StateReplayer::Impl &othe
replayed_graphics_pipelines = other.replayed_graphics_pipelines;
}

// Empties every replayed_* lookup table held by this implementation object.
// Each clear() acts on a separate container, so the order of the calls does not matter.
void StateReplayer::Impl::forget_handle_references()
{
	// Pipelines.
	replayed_graphics_pipelines.clear();
	replayed_compute_pipelines.clear();

	// Objects referenced while building pipelines.
	replayed_render_passes.clear();
	replayed_shader_modules.clear();

	// Layout and sampler objects.
	replayed_pipeline_layouts.clear();
	replayed_descriptor_set_layouts.clear();
	replayed_samplers.clear();
}

bool StateReplayer::Impl::parse(StateCreatorInterface &iface, DatabaseInterface *resolver, const void *buffer_, size_t total_size)
{
// All data after a string terminating '\0' is considered binary payload
Expand Down
2 changes: 2 additions & 0 deletions fossilize.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,8 @@ class StateReplayer
// Lets other StateReplayers have the same references to objects.
void copy_handle_references(const StateReplayer &replayer);

void forget_handle_references();

ScratchAllocator &get_allocator();

// Disable copies (and moves).
Expand Down
5 changes: 5 additions & 0 deletions test/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,11 @@ add_executable(multi-instance-and-device-test multi_instance_and_device_test.cpp
target_link_libraries(multi-instance-and-device-test cli-utils fossilize)
set_target_properties(multi-instance-and-device-test PROPERTIES LINK_FLAGS "${FOSSILIZE_LINK_FLAGS}")

add_executable(object-cache-test object_cache_test.cpp)
target_link_libraries(object-cache-test fossilize)
set_target_properties(object-cache-test PROPERTIES LINK_FLAGS "${FOSSILIZE_LINK_FLAGS}")
add_test(NAME object-cache-test COMMAND object-cache-test)

if (NOT WIN32)
add_executable(futex-test futex_test.cpp)
target_link_libraries(futex-test fossilize -pthread)
Expand Down
Loading

0 comments on commit a536eb9

Please sign in to comment.