From 3187e8cc8bf4d7fdca5a6ceaa837867fbf23f36c Mon Sep 17 00:00:00 2001 From: Michael Keiblinger Date: Sun, 1 Sep 2024 17:01:22 +0200 Subject: [PATCH] Add support for struct kernel parameters --- driverapi/src/cmdqueue.cpp | 16 +- tests/CMakeLists.txt | 3 +- .../CMakeLists.txt | 2 +- tests/dynamic_shared_mem/CMakeLists.txt | 2 +- tests/kernel_struct_param/CMakeLists.txt | 11 + tests/kernel_struct_param/main.cpp | 126 +++++++++++ .../kernel_struct_param/read_from_struct.asm | 213 ++++++++++++++++++ tests/kernel_struct_param/read_from_struct.cu | 10 + .../read_from_struct.cubin | Bin 0 -> 2728 bytes .../kernel_struct_param/read_from_struct.ptx | 31 +++ tests/test_async_kernels/CMakeLists.txt | 3 +- 11 files changed, 412 insertions(+), 5 deletions(-) create mode 100644 tests/kernel_struct_param/CMakeLists.txt create mode 100644 tests/kernel_struct_param/main.cpp create mode 100644 tests/kernel_struct_param/read_from_struct.asm create mode 100644 tests/kernel_struct_param/read_from_struct.cu create mode 100644 tests/kernel_struct_param/read_from_struct.cubin create mode 100644 tests/kernel_struct_param/read_from_struct.ptx diff --git a/driverapi/src/cmdqueue.cpp b/driverapi/src/cmdqueue.cpp index 55da928..39e886c 100644 --- a/driverapi/src/cmdqueue.cpp +++ b/driverapi/src/cmdqueue.cpp @@ -545,7 +545,21 @@ NvCommandQueue::launchFunction(LibreCUFunction function, kernargs_buf[j++] = param_value; break; } - default: LIBRECUDA_FAIL(LIBRECUDA_ERROR_INVALID_VALUE) + default: { + if (param_size % sizeof(NvU32) != 0) { + // cuda encodes everything with these 32-bit words. The fact that this would be allowed is highly + // implausible given that even most c compilers pad struct lengths to multiples of 4 anyway, + // so cuda doing it any different would be highly implausible + LIBRECUDA_DEBUG("Encountered kernel with array parameter with size % 4 != 0! This should not be possible"); + LIBRECUDA_FAIL(LIBRECUDA_ERROR_INVALID_VALUE); + } + auto *param_ptr = reinterpret_cast(params[i]); + size_t num_words = param_size / sizeof(NvU32); + for (size_t k = 0; k < num_words; k++) { + kernargs_buf[j++] = param_ptr[k]; + } + break; + } } } } diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 21cdfd8..7e7c97a 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -3,4 +3,5 @@ add_subdirectory(memcopy) add_subdirectory(dynamic_shared_mem) add_subdirectory(compute_chronological_consistency) add_subdirectory(test_async_kernels) -add_subdirectory(dma_chronological_consistency) \ No newline at end of file +add_subdirectory(dma_chronological_consistency) +add_subdirectory(kernel_struct_param) \ No newline at end of file diff --git a/tests/compute_chronological_consistency/CMakeLists.txt b/tests/compute_chronological_consistency/CMakeLists.txt index c91c630..8c57761 100644 --- a/tests/compute_chronological_consistency/CMakeLists.txt +++ b/tests/compute_chronological_consistency/CMakeLists.txt @@ -8,4 +8,4 @@ target_link_libraries( driverapi ) -configure_file("${CMAKE_CURRENT_LIST_DIR}/write_float.cubin" ${CMAKE_BINARY_DIR}/tests/write_float COPYONLY) \ No newline at end of file +configure_file("${CMAKE_CURRENT_LIST_DIR}/write_float.cubin" ${CMAKE_BINARY_DIR}/tests/compute_chronological_consistency COPYONLY) \ No newline at end of file diff --git a/tests/dynamic_shared_mem/CMakeLists.txt b/tests/dynamic_shared_mem/CMakeLists.txt index 963d3b0..13f5228 100644 --- a/tests/dynamic_shared_mem/CMakeLists.txt +++ b/tests/dynamic_shared_mem/CMakeLists.txt @@ -8,4 +8,4 @@ target_link_libraries( driverapi ) -configure_file("${CMAKE_CURRENT_LIST_DIR}/write_float.cubin" ${CMAKE_BINARY_DIR}/tests/write_float COPYONLY) \ No newline at end of file +configure_file("${CMAKE_CURRENT_LIST_DIR}/write_float.cubin" ${CMAKE_BINARY_DIR}/tests/dynamic_shared_mem COPYONLY) \ No newline at end of file diff --git a/tests/kernel_struct_param/CMakeLists.txt b/tests/kernel_struct_param/CMakeLists.txt new file mode 100644 index 0000000..cd483b3 --- /dev/null +++ b/tests/kernel_struct_param/CMakeLists.txt @@ -0,0 +1,11 @@ +add_executable( + test_kernel_struct_param + main.cpp +) +target_link_libraries( + test_kernel_struct_param + PRIVATE + driverapi +) + +configure_file("${CMAKE_CURRENT_LIST_DIR}/read_from_struct.cubin" ${CMAKE_BINARY_DIR}/tests/kernel_struct_param COPYONLY) \ No newline at end of file diff --git a/tests/kernel_struct_param/main.cpp b/tests/kernel_struct_param/main.cpp new file mode 100644 index 0000000..4033ab7 --- /dev/null +++ b/tests/kernel_struct_param/main.cpp @@ -0,0 +1,126 @@ +#include + +#include +#include +#include +#include +#include + +inline void cudaCheck(libreCudaStatus_t error, const char *file, int line) { + if (error != LIBRECUDA_SUCCESS) { + const char *error_string; + libreCuGetErrorString(error, &error_string); + printf("[CUDA ERROR] at file %s:%d: %s\n", file, line, error_string); + exit(EXIT_FAILURE); + } +}; +#define CUDA_CHECK(err) (cudaCheck(err, __FILE__, __LINE__)) + +struct struct_t { + int x, y, z; + int w, h, d; + char str[32]; + char me_ugly; +}; +static_assert(sizeof(struct_t) == 60); + +int main() { + CUDA_CHECK(libreCuInit(0)); + + int device_count{}; + CUDA_CHECK(libreCuDeviceGetCount(&device_count)); + std::cout << "Device count: " + std::to_string(device_count) << std::endl; + + LibreCUdevice device{}; + CUDA_CHECK(libreCuDeviceGet(&device, 0)); + + LibreCUcontext ctx{}; + CUDA_CHECK(libreCuCtxCreate_v2(&ctx, CU_CTX_SCHED_YIELD, device)); + + char name_buffer[256] = {}; + libreCuDeviceGetName(name_buffer, 256, device); + std::cout << "Device Name: " + std::string(name_buffer) << std::endl; + LibreCUmodule module{}; + + // read cubin file + uint8_t *image; + size_t n_bytes; + { + std::ifstream input("read_from_struct.cubin", std::ios::binary); + std::vector bytes( + (std::istreambuf_iterator(input)), + (std::istreambuf_iterator())); + input.close(); + image = new uint8_t[bytes.size()]; + memcpy(image, bytes.data(), bytes.size()); + n_bytes = bytes.size(); + } + CUDA_CHECK(libreCuModuleLoadData(&module, image, n_bytes)); + + // read functions + uint32_t num_funcs{}; + CUDA_CHECK(libreCuModuleGetFunctionCount(&num_funcs, module)); + std::cout << "Num functions: " << num_funcs << std::endl; + + auto *functions = new LibreCUFunction[num_funcs]; + CUDA_CHECK(libreCuModuleEnumerateFunctions(functions, num_funcs, module)); + + for (size_t i = 0; i < num_funcs; i++) { + LibreCUFunction func = functions[i]; + const char *func_name{}; + CUDA_CHECK(libreCuFuncGetName(&func_name, func)); + std::cout << " function \"" << func_name << "\"" << std::endl; + } + + delete[] functions; + + // find function + LibreCUFunction func{}; + CUDA_CHECK(libreCuModuleGetFunction(&func, module, "read_from_struct")); + // create stream + LibreCUstream stream{}; + CUDA_CHECK(libreCuStreamCreate(&stream, 0)); + + void *w_dst_va{}; + CUDA_CHECK(libreCuMemAlloc(&w_dst_va, sizeof(int), true)); + + struct_t s = { + .w=64, + }; + + void *params[] = { + &s, // struct + &w_dst_va, // dst + }; + + CUDA_CHECK( + libreCuLaunchKernel(func, + 1, 1, 1, + 1, 1, 1, + 8192, + stream, + params, sizeof(params) / sizeof(void *), + nullptr + ) + ); + + // dispatch built up command buffer to GPU + CUDA_CHECK(libreCuStreamCommence(stream)); + + // wait for work to complete + CUDA_CHECK(libreCuStreamAwait(stream)); + std::cout << "Dst value (post exec): " << *(int *) (w_dst_va) << std::endl; + + // free memory + CUDA_CHECK(libreCuMemFree(w_dst_va)); + + // destroy stream + CUDA_CHECK(libreCuStreamDestroy(stream)); + + // unload module + CUDA_CHECK(libreCuModuleUnload(module)); + + // destroy ctx + CUDA_CHECK(libreCuCtxDestroy(ctx)); + return 0; +} \ No newline at end of file diff --git a/tests/kernel_struct_param/read_from_struct.asm b/tests/kernel_struct_param/read_from_struct.asm new file mode 100644 index 0000000..9f373f3 --- /dev/null +++ b/tests/kernel_struct_param/read_from_struct.asm @@ -0,0 +1,213 @@ + .headerflags @"EF_CUDA_TEXMODE_UNIFIED EF_CUDA_64BIT_ADDRESS EF_CUDA_SM80 EF_CUDA_VIRTUAL_SM(EF_CUDA_SM80)" + .elftype @"ET_EXEC" + + +//--------------------- .debug_frame -------------------------- + .section .debug_frame,"",@progbits +.debug_frame: + /*0000*/ .byte 0xff, 0xff, 0xff, 0xff, 0x24, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff + /*0010*/ .byte 0xff, 0xff, 0xff, 0xff, 0x03, 0x00, 0x04, 0x7c, 0xff, 0xff, 0xff, 0xff, 0x0f, 0x0c, 0x81, 0x80 + /*0020*/ .byte 0x80, 0x28, 0x00, 0x08, 0xff, 0x81, 0x80, 0x28, 0x08, 0x81, 0x80, 0x80, 0x28, 0x00, 0x00, 0x00 + /*0030*/ .byte 0xff, 0xff, 0xff, 0xff, 0x34, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + /*0040*/ .byte 0x00, 0x00, 0x00, 0x00 + /*0044*/ .dword read_from_struct + /*004c*/ .byte 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x04, 0x00, 0x00, 0x00, 0x04, 0x18, 0x00 + /*005c*/ .byte 0x00, 0x00, 0x0c, 0x81, 0x80, 0x80, 0x28, 0x00, 0x04, 0xfc, 0xff, 0xff, 0x3f, 0x00, 0x00, 0x00 + /*006c*/ .byte 0x00, 0x00, 0x00, 0x00 + + +//--------------------- .nv.info -------------------------- + .section .nv.info,"",@"SHT_CUDA_INFO" + .align 4 + + + //----- nvinfo : EIATTR_REGCOUNT + .align 4 + /*0000*/ .byte 0x04, 0x2f + /*0002*/ .short (.L_1 - .L_0) + .align 4 +.L_0: + /*0004*/ .word index@(read_from_struct) + /*0008*/ .word 0x00000008 + + + //----- nvinfo : EIATTR_MIN_STACK_SIZE + .align 4 +.L_1: + /*000c*/ .byte 0x04, 0x12 + /*000e*/ .short (.L_3 - .L_2) + .align 4 +.L_2: + /*0010*/ .word index@(read_from_struct) + /*0014*/ .word 0x00000000 + + + //----- nvinfo : EIATTR_FRAME_SIZE + .align 4 +.L_3: + /*0018*/ .byte 0x04, 0x11 + /*001a*/ .short (.L_5 - .L_4) + .align 4 +.L_4: + /*001c*/ .word index@(read_from_struct) + /*0020*/ .word 0x00000000 + + + //----- nvinfo : EIATTR_MIN_STACK_SIZE + .align 4 +.L_5: + /*0024*/ .byte 0x04, 0x12 + /*0026*/ .short (.L_7 - .L_6) + .align 4 +.L_6: + /*0028*/ .word index@(read_from_struct) + /*002c*/ .word 0x00000000 +.L_7: + + +//--------------------- .nv.info.read_from_struct -------------------------- + .section .nv.info.read_from_struct,"",@"SHT_CUDA_INFO" + .sectionflags @"" + .align 4 + + + //----- nvinfo : EIATTR_CUDA_API_VERSION + .align 4 + /*0000*/ .byte 0x04, 0x37 + /*0002*/ .short (.L_9 - .L_8) +.L_8: + /*0004*/ .word 0x0000007c + + + //----- nvinfo : EIATTR_SW2861232_WAR + .align 4 +.L_9: + /*0008*/ .byte 0x01, 0x35 + .zero 2 + + + //----- nvinfo : EIATTR_PARAM_CBANK + .align 4 + /*000c*/ .byte 0x04, 0x0a + /*000e*/ .short (.L_11 - .L_10) + .align 4 +.L_10: + /*0010*/ .word index@(.nv.constant0.read_from_struct) + /*0014*/ .short 0x0160 + /*0016*/ .short 0x0040 + + + //----- nvinfo : EIATTR_CBANK_PARAM_SIZE + .align 4 +.L_11: + /*0018*/ .byte 0x03, 0x19 + /*001a*/ .short 0x0040 + + + //----- nvinfo : EIATTR_KPARAM_INFO + .align 4 + /*001c*/ .byte 0x04, 0x17 + /*001e*/ .short (.L_13 - .L_12) +.L_12: + /*0020*/ .word 0x00000000 + /*0024*/ .short 0x0001 + /*0026*/ .short 0x0038 + /*0028*/ .byte 0x00, 0xf0, 0x21, 0x00 + + + //----- nvinfo : EIATTR_KPARAM_INFO + .align 4 +.L_13: + /*002c*/ .byte 0x04, 0x17 + /*002e*/ .short (.L_15 - .L_14) +.L_14: + /*0030*/ .word 0x00000000 + /*0034*/ .short 0x0000 + /*0036*/ .short 0x0000 + /*0038*/ .byte 0x00, 0xf0, 0xe1, 0x00 + + + //----- nvinfo : EIATTR_MAXREG_COUNT + .align 4 +.L_15: + /*003c*/ .byte 0x03, 0x1b + /*003e*/ .short 0x00ff + + + //----- nvinfo : EIATTR_EXIT_INSTR_OFFSETS + .align 4 + /*0040*/ .byte 0x04, 0x1c + /*0042*/ .short (.L_17 - .L_16) + + + // ....[0].... +.L_16: + /*0044*/ .word 0x00000060 +.L_17: + + +//--------------------- .nv.callgraph -------------------------- + .section .nv.callgraph,"",@"SHT_CUDA_CALLGRAPH" + .align 4 + .sectionentsize 8 + .align 4 + /*0000*/ .word 0x00000000 + .align 4 + /*0004*/ .word 0xffffffff + .align 4 + /*0008*/ .word 0x00000000 + .align 4 + /*000c*/ .word 0xfffffffe + .align 4 + /*0010*/ .word 0x00000000 + .align 4 + /*0014*/ .word 0xfffffffd + .align 4 + /*0018*/ .word 0x00000000 + .align 4 + /*001c*/ .word 0xfffffffc + + +//--------------------- .nv.rel.action -------------------------- + .section .nv.rel.action,"",@"SHT_CUDA_RELOCINFO" + .align 8 + .sectionentsize 8 + /*0000*/ .byte 0x73, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x25, 0x00, 0x05, 0x36 + + +//--------------------- .nv.constant0.read_from_struct -------------------------- + .section .nv.constant0.read_from_struct,"a",@progbits + .sectionflags @"" + .align 4 +.nv.constant0.read_from_struct: + .zero 416 + + +//--------------------- .text.read_from_struct -------------------------- + .section .text.read_from_struct,"ax",@progbits + .sectioninfo @"SHI_REGISTERS=8" + .align 128 + .global read_from_struct + .type read_from_struct,@function + .size read_from_struct,(.L_x_1 - read_from_struct) + .other read_from_struct,@"STO_CUDA_ENTRY STV_DEFAULT" +read_from_struct: +.text.read_from_struct: + /*0000*/ IMAD.MOV.U32 R1, RZ, RZ, c[0x0][0x28] ; + /*0010*/ IMAD.MOV.U32 R5, RZ, RZ, c[0x0][0x16c] ; + /*0020*/ MOV R2, c[0x0][0x198] ; + /*0030*/ ULDC.64 UR4, c[0x0][0x118] ; + /*0040*/ MOV R3, c[0x0][0x19c] ; + /*0050*/ STG.E [R2.64], R5 ; + /*0060*/ EXIT ; +.L_x_0: + /*0070*/ BRA `(.L_x_0); + /*0080*/ NOP; + /*0090*/ NOP; + /*00a0*/ NOP; + /*00b0*/ NOP; + /*00c0*/ NOP; + /*00d0*/ NOP; + /*00e0*/ NOP; + /*00f0*/ NOP; +.L_x_1: diff --git a/tests/kernel_struct_param/read_from_struct.cu b/tests/kernel_struct_param/read_from_struct.cu new file mode 100644 index 0000000..a04764f --- /dev/null +++ b/tests/kernel_struct_param/read_from_struct.cu @@ -0,0 +1,10 @@ +struct struct_t { + int x, y, z; + int w, h, d; + char str[33]; + char me_ugly; +}; + +extern "C" __global__ void read_from_struct(struct_t s, int *pWout) { + *pWout = s.w; +} \ No newline at end of file diff --git a/tests/kernel_struct_param/read_from_struct.cubin b/tests/kernel_struct_param/read_from_struct.cubin new file mode 100644 index 0000000000000000000000000000000000000000..9bdec520a40b92ea9086f3b36c6d0ce3eb5c872b GIT binary patch literal 2728 zcmeHIJ8u&~5T5g`$q|gs2?dEr5hNpp0u%9c#|c3I2_*L**%#ZfBl)Gfixo$a;s}Wn z@e?8{{{~#q`jH>X1>|k_3rxM?!%3gVO*VqNK)_` z_Su$do*hmorin{Ay5u+7)m_N9d-?75RtNGyZ8yk! zHMd&Y@;a>&vUV!LI6gh%H(ak)ogS-n+J4}+gY_w4uhyiAf0a_LZSK^u6}MFbtkBHs zfcrB-<=keo?z!ED?7Hi9f=^c#D(8WeZNAO&yBG$fl53h}m)crjYm8xQ;pV z7Njt@Y&c*pJ0Q+xh>Ewj05Cc~_9Zy_0Oq+6#EO8j1ft%_{EllQ{f#xY7y1N5vTy;? zH}0@Q^HIP5<=@_cT+avruTKci=irkKxn4Si$LxQ!A+?9Uq9Dyu2{}yDE_FAj+B83pKZ8@{s_-a7(O-OjSWlVI3~@&=jQ%e(bdHr zNQlD-y&rMWT=}9+<3HiVOBz9Q;S3h1yQa$v28SF__ffrY^vaMNK-6QdwLs3T*}Npif#C`_0#_%r6k^3 s1u&oHN5~qWjtiyHeU; + .reg .b64 %rd<3>; + + + ld.param.u64 %rd1, [read_from_struct_param_1]; + ld.param.u32 %r1, [read_from_struct_param_0+12]; + cvta.to.global.u64 %rd2, %rd1; + st.global.u32 [%rd2], %r1; + ret; + +} + diff --git a/tests/test_async_kernels/CMakeLists.txt b/tests/test_async_kernels/CMakeLists.txt index 1efccd1..04fdcb1 100644 --- a/tests/test_async_kernels/CMakeLists.txt +++ b/tests/test_async_kernels/CMakeLists.txt @@ -1,6 +1,7 @@ add_executable( test_async_kernels main.cpp + ../kernel_struct_param/main.cpp ) target_link_libraries( test_async_kernels @@ -8,4 +9,4 @@ target_link_libraries( driverapi ) -configure_file("${CMAKE_CURRENT_LIST_DIR}/write_float.cubin" ${CMAKE_BINARY_DIR}/tests/write_float COPYONLY) \ No newline at end of file +configure_file("${CMAKE_CURRENT_LIST_DIR}/write_float.cubin" ${CMAKE_BINARY_DIR}/tests/test_async_kernels COPYONLY) \ No newline at end of file