Skip to content

Commit

Permalink
Add indexing tests
Browse files Browse the repository at this point in the history
  • Loading branch information
mikex86 committed Sep 5, 2024
1 parent d2805f0 commit a91732a
Show file tree
Hide file tree
Showing 8 changed files with 405 additions and 5 deletions.
4 changes: 0 additions & 4 deletions driverapi/src/cmdqueue.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -295,8 +295,6 @@ libreCudaStatus_t NvCommandQueue::signalNotify(NvSignal *pSignal, NvU32 signalTa
LIBRECUDA_ERR_PROPAGATE(enqueue(
makeNvMethod(4, NVC6B5_SET_SEMAPHORE_A, 3),
{
// little endian inside NvU32s but big endian across NvU32s for some reason...
// don't question nvidia's autism...
U64_HI_32_BITS(pSignal),
U64_LO_32_BITS(pSignal),

Expand Down Expand Up @@ -417,7 +415,6 @@ libreCudaStatus_t NvCommandQueue::ensureEnoughLocalMem(LibreCUFunction function)
LIBRECUDA_ERR_PROPAGATE(enqueue(
makeNvMethod(1, NVC6C0_SET_SHADER_LOCAL_MEMORY_A, 2),
{
// weird half big and little endian along int borders again...
U64_HI_32_BITS(function->shader_local_memory_va),
U64_LO_32_BITS(function->shader_local_memory_va)
},
Expand All @@ -426,7 +423,6 @@ libreCudaStatus_t NvCommandQueue::ensureEnoughLocalMem(LibreCUFunction function)
LIBRECUDA_ERR_PROPAGATE(enqueue(
makeNvMethod(1, NVC6C0_SET_SHADER_LOCAL_MEMORY_NON_THROTTLED_A, 3),

// weird half big and little endian along int borders again...
{
U64_HI_32_BITS(bytes_per_tpc),
U64_LO_32_BITS(bytes_per_tpc),
Expand Down
3 changes: 2 additions & 1 deletion tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,5 @@ add_subdirectory(dynamic_shared_mem)
add_subdirectory(compute_chronological_consistency)
add_subdirectory(test_async_kernels)
add_subdirectory(dma_chronological_consistency)
add_subdirectory(kernel_struct_param)
add_subdirectory(kernel_struct_param)
add_subdirectory(indexing)
11 changes: 11 additions & 0 deletions tests/indexing/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Test executable for the indexing test; loads write_float.cubin at runtime.
add_executable(
test_indexing
main.cpp
)
# Link against the librecuda driver API implementation.
target_link_libraries(
test_indexing
PRIVATE
driverapi
)

# NOTE(review): the cubin is copied to ${CMAKE_BINARY_DIR}/tests/test_indexing/,
# but this subdirectory is tests/indexing/ — confirm the executable's output /
# working directory actually is tests/test_indexing/ (as other tests may set up),
# otherwise main.cpp's open of "write_float.cubin" will find no file and read 0 bytes.
configure_file("${CMAKE_CURRENT_LIST_DIR}/write_float.cubin" ${CMAKE_BINARY_DIR}/tests/test_indexing/ COPYONLY)
130 changes: 130 additions & 0 deletions tests/indexing/main.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
#include <librecuda.h>

#include <iostream>
#include <iomanip>
#include <vector>
#include <fstream>
#include <cstring>

/// Aborts the process with a readable message when a librecuda driver call fails.
/// @param error status code returned by a libreCu* call
/// @param file  call-site source file (supplied by the CUDA_CHECK macro via __FILE__)
/// @param line  call-site source line (supplied by the CUDA_CHECK macro via __LINE__)
inline void cudaCheck(libreCudaStatus_t error, const char *file, int line) {
    if (error != LIBRECUDA_SUCCESS) {
        const char *error_string;
        libreCuGetErrorString(error, &error_string);
        // report on stderr so the diagnostic is not lost in buffered stdout
        fprintf(stderr, "[CUDA ERROR] at file %s:%d: %s\n", file, line, error_string);
        exit(EXIT_FAILURE);
    }
}  // (stray ';' after the function body removed)
#define CUDA_CHECK(err) (cudaCheck(err, __FILE__, __LINE__))

int main() {
CUDA_CHECK(libreCuInit(0));

int device_count{};
CUDA_CHECK(libreCuDeviceGetCount(&device_count));
std::cout << "Device count: " + std::to_string(device_count) << std::endl;

LibreCUdevice device{};
CUDA_CHECK(libreCuDeviceGet(&device, 0));

LibreCUcontext ctx{};
CUDA_CHECK(libreCuCtxCreate_v2(&ctx, CU_CTX_SCHED_YIELD, device));

char name_buffer[256] = {};
libreCuDeviceGetName(name_buffer, 256, device);
std::cout << "Device Name: " + std::string(name_buffer) << std::endl;

LibreCUmodule module{};

// read cubin file
uint8_t *image;
size_t n_bytes;
{
std::ifstream input("write_float.cubin", std::ios::binary);
std::vector<uint8_t> bytes(
(std::istreambuf_iterator<char>(input)),
(std::istreambuf_iterator<char>()));
input.close();
image = new uint8_t[bytes.size()];
memcpy(image, bytes.data(), bytes.size());
n_bytes = bytes.size();
}
CUDA_CHECK(libreCuModuleLoadData(&module, image, n_bytes));

// read functions
uint32_t num_funcs{};
CUDA_CHECK(libreCuModuleGetFunctionCount(&num_funcs, module));
std::cout << "Num functions: " << num_funcs << std::endl;

auto *functions = new LibreCUFunction[num_funcs];
CUDA_CHECK(libreCuModuleEnumerateFunctions(functions, num_funcs, module));

for (size_t i = 0; i < num_funcs; i++) {
LibreCUFunction func = functions[i];
const char *func_name{};
CUDA_CHECK(libreCuFuncGetName(&func_name, func));
std::cout << " function \"" << func_name << "\"" << std::endl;
}

delete[] functions;

// find function
LibreCUFunction func{};
CUDA_CHECK(libreCuModuleGetFunction(&func, module, "write_float"));

// set dynamic shared memory
CUDA_CHECK(libreCuFuncSetAttribute(func, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, 8192));

// create stream
LibreCUstream stream{};
CUDA_CHECK(libreCuStreamCreate(&stream, 0));

void *float_dst_va{};
size_t n_elements = 50256 * 768;
CUDA_CHECK(libreCuMemAlloc(&float_dst_va, n_elements * sizeof(float), true));

auto *host_dst = new float[n_elements];

void *params[] = {
&float_dst_va, // dst
&n_elements
};
CUDA_CHECK(
libreCuLaunchKernel(func,
n_elements/256, 1, 1,
256, 1, 1,
8192,
stream,
params, sizeof(params) / sizeof(void *),
nullptr
)
);
CUDA_CHECK(libreCuMemCpy(host_dst, float_dst_va, n_elements * sizeof(float), stream, false));

// dispatch built up command buffer to GPU
CUDA_CHECK(libreCuStreamCommence(stream));

// wait for work to complete
CUDA_CHECK(libreCuStreamAwait(stream));

for (size_t i = 0; i < n_elements; i++) {
if (host_dst[i] != 1.0) {
std::cerr << "Not all values were filled!" << std::endl;
break;
}
}

// free memory
CUDA_CHECK(libreCuMemFree(float_dst_va));

delete[] host_dst;

// destroy stream
CUDA_CHECK(libreCuStreamDestroy(stream));

// unload module
CUDA_CHECK(libreCuModuleUnload(module));

// destroy ctx
CUDA_CHECK(libreCuCtxDestroy(ctx));
return 0;
}
Loading

0 comments on commit a91732a

Please sign in to comment.