From a91732ad2e17aad28bcabff659f69fe0d3315496 Mon Sep 17 00:00:00 2001 From: Michael Keiblinger Date: Thu, 5 Sep 2024 13:01:26 +0200 Subject: [PATCH] Add indexing tests --- driverapi/src/cmdqueue.cpp | 4 - tests/CMakeLists.txt | 3 +- tests/indexing/CMakeLists.txt | 11 ++ tests/indexing/main.cpp | 130 ++++++++++++++++++ tests/indexing/write_float.asm | 221 +++++++++++++++++++++++++++++++ tests/indexing/write_float.cu | 4 + tests/indexing/write_float.cubin | Bin 0 -> 2856 bytes tests/indexing/write_float.ptx | 37 ++++++ 8 files changed, 405 insertions(+), 5 deletions(-) create mode 100644 tests/indexing/CMakeLists.txt create mode 100644 tests/indexing/main.cpp create mode 100644 tests/indexing/write_float.asm create mode 100644 tests/indexing/write_float.cu create mode 100644 tests/indexing/write_float.cubin create mode 100644 tests/indexing/write_float.ptx diff --git a/driverapi/src/cmdqueue.cpp b/driverapi/src/cmdqueue.cpp index 2f06b6d..8ef7b09 100644 --- a/driverapi/src/cmdqueue.cpp +++ b/driverapi/src/cmdqueue.cpp @@ -295,8 +295,6 @@ libreCudaStatus_t NvCommandQueue::signalNotify(NvSignal *pSignal, NvU32 signalTa LIBRECUDA_ERR_PROPAGATE(enqueue( makeNvMethod(4, NVC6B5_SET_SEMAPHORE_A, 3), { - // little endian inside NvU32s but big endian across NvU32s for some reason... - // don't question nvidia's autism... U64_HI_32_BITS(pSignal), U64_LO_32_BITS(pSignal), @@ -417,7 +415,6 @@ libreCudaStatus_t NvCommandQueue::ensureEnoughLocalMem(LibreCUFunction function) LIBRECUDA_ERR_PROPAGATE(enqueue( makeNvMethod(1, NVC6C0_SET_SHADER_LOCAL_MEMORY_A, 2), { - // weird half big and little endian along int borders again... U64_HI_32_BITS(function->shader_local_memory_va), U64_LO_32_BITS(function->shader_local_memory_va) }, @@ -426,7 +423,6 @@ libreCudaStatus_t NvCommandQueue::ensureEnoughLocalMem(LibreCUFunction function) LIBRECUDA_ERR_PROPAGATE(enqueue( makeNvMethod(1, NVC6C0_SET_SHADER_LOCAL_MEMORY_NON_THROTTLED_A, 3), - // weird half big and little endian along int borders again... { U64_HI_32_BITS(bytes_per_tpc), U64_LO_32_BITS(bytes_per_tpc), diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 7e7c97a..21e48a6 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -4,4 +4,5 @@ add_subdirectory(dynamic_shared_mem) add_subdirectory(compute_chronological_consistency) add_subdirectory(test_async_kernels) add_subdirectory(dma_chronological_consistency) -add_subdirectory(kernel_struct_param) \ No newline at end of file +add_subdirectory(kernel_struct_param) +add_subdirectory(indexing) \ No newline at end of file diff --git a/tests/indexing/CMakeLists.txt b/tests/indexing/CMakeLists.txt new file mode 100644 index 0000000..8d68b07 --- /dev/null +++ b/tests/indexing/CMakeLists.txt @@ -0,0 +1,11 @@ +add_executable( + test_indexing + main.cpp +) +target_link_libraries( + test_indexing + PRIVATE + driverapi +) + +configure_file("${CMAKE_CURRENT_LIST_DIR}/write_float.cubin" ${CMAKE_BINARY_DIR}/tests/test_indexing/ COPYONLY) \ No newline at end of file diff --git a/tests/indexing/main.cpp b/tests/indexing/main.cpp new file mode 100644 index 0000000..7422f12 --- /dev/null +++ b/tests/indexing/main.cpp @@ -0,0 +1,130 @@ +#include + +#include +#include +#include +#include +#include + +inline void cudaCheck(libreCudaStatus_t error, const char *file, int line) { + if (error != LIBRECUDA_SUCCESS) { + const char *error_string; + libreCuGetErrorString(error, &error_string); + printf("[CUDA ERROR] at file %s:%d: %s\n", file, line, error_string); + exit(EXIT_FAILURE); + } +}; +#define CUDA_CHECK(err) (cudaCheck(err, __FILE__, __LINE__)) + +int main() { + CUDA_CHECK(libreCuInit(0)); + + int device_count{}; + CUDA_CHECK(libreCuDeviceGetCount(&device_count)); + std::cout << "Device count: " + std::to_string(device_count) << std::endl; + + LibreCUdevice device{}; + CUDA_CHECK(libreCuDeviceGet(&device, 0)); + + LibreCUcontext ctx{}; + CUDA_CHECK(libreCuCtxCreate_v2(&ctx, CU_CTX_SCHED_YIELD, device)); + + char name_buffer[256] = {}; + libreCuDeviceGetName(name_buffer, 256, device); + std::cout << "Device Name: " + std::string(name_buffer) << std::endl; + + LibreCUmodule module{}; + + // read cubin file + uint8_t *image; + size_t n_bytes; + { + std::ifstream input("write_float.cubin", std::ios::binary); + std::vector bytes( + (std::istreambuf_iterator(input)), + (std::istreambuf_iterator())); + input.close(); + image = new uint8_t[bytes.size()]; + memcpy(image, bytes.data(), bytes.size()); + n_bytes = bytes.size(); + } + CUDA_CHECK(libreCuModuleLoadData(&module, image, n_bytes)); + + // read functions + uint32_t num_funcs{}; + CUDA_CHECK(libreCuModuleGetFunctionCount(&num_funcs, module)); + std::cout << "Num functions: " << num_funcs << std::endl; + + auto *functions = new LibreCUFunction[num_funcs]; + CUDA_CHECK(libreCuModuleEnumerateFunctions(functions, num_funcs, module)); + + for (size_t i = 0; i < num_funcs; i++) { + LibreCUFunction func = functions[i]; + const char *func_name{}; + CUDA_CHECK(libreCuFuncGetName(&func_name, func)); + std::cout << " function \"" << func_name << "\"" << std::endl; + } + + delete[] functions; + + // find function + LibreCUFunction func{}; + CUDA_CHECK(libreCuModuleGetFunction(&func, module, "write_float")); + + // set dynamic shared memory + CUDA_CHECK(libreCuFuncSetAttribute(func, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, 8192)); + + // create stream + LibreCUstream stream{}; + CUDA_CHECK(libreCuStreamCreate(&stream, 0)); + + void *float_dst_va{}; + size_t n_elements = 50256 * 768; + CUDA_CHECK(libreCuMemAlloc(&float_dst_va, n_elements * sizeof(float), true)); + + auto *host_dst = new float[n_elements]; + + void *params[] = { + &float_dst_va, // dst + &n_elements + }; + CUDA_CHECK( + libreCuLaunchKernel(func, + n_elements/256, 1, 1, + 256, 1, 1, + 8192, + stream, + params, sizeof(params) / sizeof(void *), + nullptr + ) + ); + CUDA_CHECK(libreCuMemCpy(host_dst, float_dst_va, n_elements * sizeof(float), stream, false)); + + // dispatch built up command buffer to GPU + CUDA_CHECK(libreCuStreamCommence(stream)); + + // wait for work to complete + CUDA_CHECK(libreCuStreamAwait(stream)); + + for (size_t i = 0; i < n_elements; i++) { + if (host_dst[i] != 1.0) { + std::cerr << "Not all values were filled!" << std::endl; + break; + } + } + + // free memory + CUDA_CHECK(libreCuMemFree(float_dst_va)); + + delete[] host_dst; + + // destroy stream + CUDA_CHECK(libreCuStreamDestroy(stream)); + + // unload module + CUDA_CHECK(libreCuModuleUnload(module)); + + // destroy ctx + CUDA_CHECK(libreCuCtxDestroy(ctx)); + return 0; +} \ No newline at end of file diff --git a/tests/indexing/write_float.asm b/tests/indexing/write_float.asm new file mode 100644 index 0000000..f633756 --- /dev/null +++ b/tests/indexing/write_float.asm @@ -0,0 +1,221 @@ + .headerflags @"EF_CUDA_TEXMODE_UNIFIED EF_CUDA_64BIT_ADDRESS EF_CUDA_SM80 EF_CUDA_VIRTUAL_SM(EF_CUDA_SM80)" + .elftype @"ET_EXEC" + + +//--------------------- .debug_frame -------------------------- + .section .debug_frame,"",@progbits +.debug_frame: + /*0000*/ .byte 0xff, 0xff, 0xff, 0xff, 0x24, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff + /*0010*/ .byte 0xff, 0xff, 0xff, 0xff, 0x03, 0x00, 0x04, 0x7c, 0xff, 0xff, 0xff, 0xff, 0x0f, 0x0c, 0x81, 0x80 + /*0020*/ .byte 0x80, 0x28, 0x00, 0x08, 0xff, 0x81, 0x80, 0x28, 0x08, 0x81, 0x80, 0x80, 0x28, 0x00, 0x00, 0x00 + /*0030*/ .byte 0xff, 0xff, 0xff, 0xff, 0x34, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + /*0040*/ .byte 0x00, 0x00, 0x00, 0x00 + /*0044*/ .dword write_float + /*004c*/ .byte 0x80, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x04, 0x00, 0x00, 0x00, 0x04, 0x24, 0x00 + /*005c*/ .byte 0x00, 0x00, 0x0c, 0x81, 0x80, 0x80, 0x28, 0x00, 0x04, 0xfc, 0xff, 0xff, 0x3f, 0x00, 0x00, 0x00 + /*006c*/ .byte 0x00, 0x00, 0x00, 0x00 + + +//--------------------- .nv.info -------------------------- + .section .nv.info,"",@"SHT_CUDA_INFO" + .align 4 + + + //----- nvinfo : EIATTR_REGCOUNT + .align 4 + /*0000*/ .byte 0x04, 0x2f + /*0002*/ .short (.L_1 - .L_0) + .align 4 +.L_0: + /*0004*/ .word index@(write_float) + /*0008*/ .word 0x0000000a + + + //----- nvinfo : EIATTR_MIN_STACK_SIZE + .align 4 +.L_1: + /*000c*/ .byte 0x04, 0x12 + /*000e*/ .short (.L_3 - .L_2) + .align 4 +.L_2: + /*0010*/ .word index@(write_float) + /*0014*/ .word 0x00000000 + + + //----- nvinfo : EIATTR_FRAME_SIZE + .align 4 +.L_3: + /*0018*/ .byte 0x04, 0x11 + /*001a*/ .short (.L_5 - .L_4) + .align 4 +.L_4: + /*001c*/ .word index@(write_float) + /*0020*/ .word 0x00000000 + + + //----- nvinfo : EIATTR_MIN_STACK_SIZE + .align 4 +.L_5: + /*0024*/ .byte 0x04, 0x12 + /*0026*/ .short (.L_7 - .L_6) + .align 4 +.L_6: + /*0028*/ .word index@(write_float) + /*002c*/ .word 0x00000000 +.L_7: + + +//--------------------- .nv.info.write_float -------------------------- + .section .nv.info.write_float,"",@"SHT_CUDA_INFO" + .sectionflags @"" + .align 4 + + + //----- nvinfo : EIATTR_CUDA_API_VERSION + .align 4 + /*0000*/ .byte 0x04, 0x37 + /*0002*/ .short (.L_9 - .L_8) +.L_8: + /*0004*/ .word 0x0000007c + + + //----- nvinfo : EIATTR_SW2861232_WAR + .align 4 +.L_9: + /*0008*/ .byte 0x01, 0x35 + .zero 2 + + + //----- nvinfo : EIATTR_PARAM_CBANK + .align 4 + /*000c*/ .byte 0x04, 0x0a + /*000e*/ .short (.L_11 - .L_10) + .align 4 +.L_10: + /*0010*/ .word index@(.nv.constant0.write_float) + /*0014*/ .short 0x0160 + /*0016*/ .short 0x0010 + + + //----- nvinfo : EIATTR_CBANK_PARAM_SIZE + .align 4 +.L_11: + /*0018*/ .byte 0x03, 0x19 + /*001a*/ .short 0x0010 + + + //----- nvinfo : EIATTR_KPARAM_INFO + .align 4 + /*001c*/ .byte 0x04, 0x17 + /*001e*/ .short (.L_13 - .L_12) +.L_12: + /*0020*/ .word 0x00000000 + /*0024*/ .short 0x0001 + /*0026*/ .short 0x0008 + /*0028*/ .byte 0x00, 0xf0, 0x21, 0x00 + + + //----- nvinfo : EIATTR_KPARAM_INFO + .align 4 +.L_13: + /*002c*/ .byte 0x04, 0x17 + /*002e*/ .short (.L_15 - .L_14) +.L_14: + /*0030*/ .word 0x00000000 + /*0034*/ .short 0x0000 + /*0036*/ .short 0x0000 + /*0038*/ .byte 0x00, 0xf0, 0x21, 0x00 + + + //----- nvinfo : EIATTR_MAXREG_COUNT + .align 4 +.L_15: + /*003c*/ .byte 0x03, 0x1b + /*003e*/ .short 0x00ff + + + //----- nvinfo : EIATTR_EXIT_INSTR_OFFSETS + .align 4 + /*0040*/ .byte 0x04, 0x1c + /*0042*/ .short (.L_17 - .L_16) + + + // ....[0].... +.L_16: + /*0044*/ .word 0x00000090 +.L_17: + + +//--------------------- .nv.callgraph -------------------------- + .section .nv.callgraph,"",@"SHT_CUDA_CALLGRAPH" + .align 4 + .sectionentsize 8 + .align 4 + /*0000*/ .word 0x00000000 + .align 4 + /*0004*/ .word 0xffffffff + .align 4 + /*0008*/ .word 0x00000000 + .align 4 + /*000c*/ .word 0xfffffffe + .align 4 + /*0010*/ .word 0x00000000 + .align 4 + /*0014*/ .word 0xfffffffd + .align 4 + /*0018*/ .word 0x00000000 + .align 4 + /*001c*/ .word 0xfffffffc + + +//--------------------- .nv.rel.action -------------------------- + .section .nv.rel.action,"",@"SHT_CUDA_RELOCINFO" + .align 8 + .sectionentsize 8 + /*0000*/ .byte 0x73, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x25, 0x00, 0x05, 0x36 + + +//--------------------- .nv.constant0.write_float -------------------------- + .section .nv.constant0.write_float,"a",@progbits + .sectionflags @"" + .align 4 +.nv.constant0.write_float: + .zero 368 + + +//--------------------- .text.write_float -------------------------- + .section .text.write_float,"ax",@progbits + .sectioninfo @"SHI_REGISTERS=10" + .align 128 + .global write_float + .type write_float,@function + .size write_float,(.L_x_1 - write_float) + .other write_float,@"STO_CUDA_ENTRY STV_DEFAULT" +write_float: +.text.write_float: + /*0000*/ MOV R1, c[0x0][0x28] ; + /*0010*/ S2R R2, SR_CTAID.X ; + /*0020*/ HFMA2.MMA R5, -RZ, RZ, 0, 2.384185791015625e-07 ; + /*0030*/ MOV R7, 0x3f800000 ; + /*0040*/ ULDC.64 UR4, c[0x0][0x118] ; + /*0050*/ S2R R3, SR_TID.X ; + /*0060*/ IMAD R2, R2, c[0x0][0x0], R3 ; + /*0070*/ IMAD.WIDE.U32 R2, R2, R5, c[0x0][0x160] ; + /*0080*/ STG.E [R2.64], R7 ; + /*0090*/ EXIT ; +.L_x_0: + /*00a0*/ BRA `(.L_x_0); + /*00b0*/ NOP; + /*00c0*/ NOP; + /*00d0*/ NOP; + /*00e0*/ NOP; + /*00f0*/ NOP; + /*0100*/ NOP; + /*0110*/ NOP; + /*0120*/ NOP; + /*0130*/ NOP; + /*0140*/ NOP; + /*0150*/ NOP; + /*0160*/ NOP; + /*0170*/ NOP; +.L_x_1: diff --git a/tests/indexing/write_float.cu b/tests/indexing/write_float.cu new file mode 100644 index 0000000..cde69bb --- /dev/null +++ b/tests/indexing/write_float.cu @@ -0,0 +1,4 @@ +extern "C" __global__ void write_float(float *dst, size_t n) { + size_t tid = blockDim.x * blockIdx.x + threadIdx.x; + dst[tid] = 1.0f; +} \ No newline at end of file diff --git a/tests/indexing/write_float.cubin b/tests/indexing/write_float.cubin new file mode 100644 index 0000000000000000000000000000000000000000..c5f449eee66263005e3f534963cc926aff775095 GIT binary patch literal 2856 zcmeHJ&rcIk5T5S7O2vjPc#uXD#Xy3WN>tPvjX%(XCLFop0);jy?Pj|oB_u8}@ksm= zJn|2C@M1js7kD-vj3*8r)pfqN^V-K2O^k^q#7W-H%s2DRy!ZO{ZJ*wKFz@7YSBEGS zhu+YR*ii9s>co<;kYAuG-6WTvMamIXI`vKzM*cEeMt3(-U+bk#y|ubUmDXmZ(OPR$ zC8}*jl_y~%sx7TG+kVsw4S=1xAJ$d};FWf(6Zx%Z=5QpeHPPfJ(bd}W34}Z~kp$~}V@=+Zt_4~%`LlTzB)YLY7m(&ilSKN)` zX(}YU@pM6(Y5q0I&ni<>VY&)c(*o*ilFUi6x?G?WY-6QrEVU6*qn2&sH4Wx62`JLAW{Y^AaFX;eCWps-2 z*YAjfewgonIi5SA=d=HM zo1^m#p$7ZquFD3?1dquPn%s7DLD$(EqIc(MYLop%qCCO(=y{hMR8-|s5&VZ;kRh>4 zpFO(A{sEuQBzfl2TaU(jaZWh~zFz7dJn*x~jvuO06%Y5GEg4te#7FUOOQM${WL!GU z#pT4(I+}{wmxJS&*JPwcr@oY^UkVysF=|8f$m(ox+jRN7%2-U0+XW16H=YpyY zvi(vB_XUTGuSb@T?-0*vp#LY~U*I?-+u!U1zE@m#fM1YCCC%4;h428NXzpu|1<3v?0o@@ XHuFbW@XS0Vt)~`c!;Q}Jez)Hbs$oKD literal 0 HcmV?d00001 diff --git a/tests/indexing/write_float.ptx b/tests/indexing/write_float.ptx new file mode 100644 index 0000000..669cf65 --- /dev/null +++ b/tests/indexing/write_float.ptx @@ -0,0 +1,37 @@ +// +// Generated by NVIDIA NVVM Compiler +// +// Compiler Build ID: CL-34097967 +// Cuda compilation tools, release 12.4, V12.4.131 +// Based on NVVM 7.0.1 +// + +.version 8.4 +.target sm_80 +.address_size 64 + + // .globl write_float + +.visible .entry write_float( + .param .u64 write_float_param_0, + .param .u64 write_float_param_1 +) +{ + .reg .b32 %r<6>; + .reg .b64 %rd<5>; + + + ld.param.u64 %rd1, [write_float_param_0]; + cvta.to.global.u64 %rd2, %rd1; + mov.u32 %r1, %ntid.x; + mov.u32 %r2, %ctaid.x; + mov.u32 %r3, %tid.x; + mad.lo.s32 %r4, %r1, %r2, %r3; + mul.wide.u32 %rd3, %r4, 4; + add.s64 %rd4, %rd2, %rd3; + mov.u32 %r5, 1065353216; + st.global.u32 [%rd4], %r5; + ret; + +} +