From a91732ad2e17aad28bcabff659f69fe0d3315496 Mon Sep 17 00:00:00 2001
From: Michael Keiblinger <michael.keiblinger@gmail.com>
Date: Thu, 5 Sep 2024 13:01:26 +0200
Subject: [PATCH] Add indexing tests

---
 driverapi/src/cmdqueue.cpp       |   4 -
 tests/CMakeLists.txt             |   3 +-
 tests/indexing/CMakeLists.txt    |  11 ++
 tests/indexing/main.cpp          | 130 ++++++++++++++++++
 tests/indexing/write_float.asm   | 221 +++++++++++++++++++++++++++++++
 tests/indexing/write_float.cu    |   4 +
 tests/indexing/write_float.cubin | Bin 0 -> 2856 bytes
 tests/indexing/write_float.ptx   |  37 ++++++
 8 files changed, 405 insertions(+), 5 deletions(-)
 create mode 100644 tests/indexing/CMakeLists.txt
 create mode 100644 tests/indexing/main.cpp
 create mode 100644 tests/indexing/write_float.asm
 create mode 100644 tests/indexing/write_float.cu
 create mode 100644 tests/indexing/write_float.cubin
 create mode 100644 tests/indexing/write_float.ptx
diff --git a/driverapi/src/cmdqueue.cpp b/driverapi/src/cmdqueue.cpp
index 2f06b6d..8ef7b09 100644
--- a/driverapi/src/cmdqueue.cpp
+++ b/driverapi/src/cmdqueue.cpp
@@ -295,8 +295,6 @@ libreCudaStatus_t NvCommandQueue::signalNotify(NvSignal *pSignal, NvU32 signalTa
             LIBRECUDA_ERR_PROPAGATE(enqueue(
                     makeNvMethod(4, NVC6B5_SET_SEMAPHORE_A, 3),
                     {
-                            // little endian inside NvU32s but big endian across NvU32s for some reason...
-                            // don't question nvidia's autism...
                             U64_HI_32_BITS(pSignal),
                             U64_LO_32_BITS(pSignal),
 
@@ -417,7 +415,6 @@ libreCudaStatus_t NvCommandQueue::ensureEnoughLocalMem(LibreCUFunction function)
         LIBRECUDA_ERR_PROPAGATE(enqueue(
                 makeNvMethod(1, NVC6C0_SET_SHADER_LOCAL_MEMORY_A, 2),
                 {
-                        // weird half big and little endian along int borders again...
                         U64_HI_32_BITS(function->shader_local_memory_va),
                         U64_LO_32_BITS(function->shader_local_memory_va)
                 },
@@ -426,7 +423,6 @@ libreCudaStatus_t NvCommandQueue::ensureEnoughLocalMem(LibreCUFunction function)
         LIBRECUDA_ERR_PROPAGATE(enqueue(
                 makeNvMethod(1, NVC6C0_SET_SHADER_LOCAL_MEMORY_NON_THROTTLED_A, 3),
 
-                // weird half big and little endian along int borders again...
                 {
                         U64_HI_32_BITS(bytes_per_tpc),
                         U64_LO_32_BITS(bytes_per_tpc),
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 7e7c97a..21e48a6 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -4,4 +4,5 @@ add_subdirectory(dynamic_shared_mem)
 add_subdirectory(compute_chronological_consistency)
 add_subdirectory(test_async_kernels)
 add_subdirectory(dma_chronological_consistency)
-add_subdirectory(kernel_struct_param)
\ No newline at end of file
+add_subdirectory(kernel_struct_param)
+add_subdirectory(indexing)
\ No newline at end of file
diff --git a/tests/indexing/CMakeLists.txt b/tests/indexing/CMakeLists.txt
new file mode 100644
index 0000000..8d68b07
--- /dev/null
+++ b/tests/indexing/CMakeLists.txt
@@ -0,0 +1,11 @@
+add_executable(
+        test_indexing
+        main.cpp
+)
+target_link_libraries(
+        test_indexing
+        PRIVATE
+        driverapi
+)
+
+configure_file("${CMAKE_CURRENT_LIST_DIR}/write_float.cubin" ${CMAKE_BINARY_DIR}/tests/test_indexing/ COPYONLY)
\ No newline at end of file
diff --git a/tests/indexing/main.cpp b/tests/indexing/main.cpp
new file mode 100644
index 0000000..7422f12
--- /dev/null
+++ b/tests/indexing/main.cpp
@@ -0,0 +1,130 @@
+#include <librecuda.h>
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <fstream>
+#include <cstring>
+
+inline void cudaCheck(libreCudaStatus_t error, const char *file, int line) {
+    if (error != LIBRECUDA_SUCCESS) {
+        const char *error_string;
+        libreCuGetErrorString(error, &error_string);
+        printf("[CUDA ERROR] at file %s:%d: %s\n", file, line, error_string);
+        exit(EXIT_FAILURE);
+    }
+};
+#define CUDA_CHECK(err) (cudaCheck(err, __FILE__, __LINE__))
+
+int main() {
+    CUDA_CHECK(libreCuInit(0));
+
+    int device_count{};
+    CUDA_CHECK(libreCuDeviceGetCount(&device_count));
+    std::cout << "Device count: " + std::to_string(device_count) << std::endl;
+
+    LibreCUdevice device{};
+    CUDA_CHECK(libreCuDeviceGet(&device, 0));
+
+    LibreCUcontext ctx{};
+    CUDA_CHECK(libreCuCtxCreate_v2(&ctx, CU_CTX_SCHED_YIELD, device));
+
+    char name_buffer[256] = {};
+    libreCuDeviceGetName(name_buffer, 256, device);
+    std::cout << "Device Name: " + std::string(name_buffer) << std::endl;
+
+    LibreCUmodule module{};
+
+    // read cubin file
+    uint8_t *image;
+    size_t n_bytes;
+    {
+        std::ifstream input("write_float.cubin", std::ios::binary);
+        std::vector<uint8_t> bytes(
+                (std::istreambuf_iterator<char>(input)),
+                (std::istreambuf_iterator<char>()));
+        input.close();
+        image = new uint8_t[bytes.size()];
+        memcpy(image, bytes.data(), bytes.size());
+        n_bytes = bytes.size();
+    }
+    CUDA_CHECK(libreCuModuleLoadData(&module, image, n_bytes));
+
+    // read functions
+    uint32_t num_funcs{};
+    CUDA_CHECK(libreCuModuleGetFunctionCount(&num_funcs, module));
+    std::cout << "Num functions: " << num_funcs << std::endl;
+
+    auto *functions = new LibreCUFunction[num_funcs];
+    CUDA_CHECK(libreCuModuleEnumerateFunctions(functions, num_funcs, module));
+
+    for (size_t i = 0; i < num_funcs; i++) {
+        LibreCUFunction func = functions[i];
+        const char *func_name{};
+        CUDA_CHECK(libreCuFuncGetName(&func_name, func));
+        std::cout << "  function \"" << func_name << "\"" << std::endl;
+    }
+
+    delete[] functions;
+
+    // find function
+    LibreCUFunction func{};
+    CUDA_CHECK(libreCuModuleGetFunction(&func, module, "write_float"));
+
+    // set dynamic shared memory
+    CUDA_CHECK(libreCuFuncSetAttribute(func, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, 8192));
+
+    // create stream
+    LibreCUstream stream{};
+    CUDA_CHECK(libreCuStreamCreate(&stream, 0));
+
+    void *float_dst_va{};
+    size_t n_elements = 50256 * 768;
+    CUDA_CHECK(libreCuMemAlloc(&float_dst_va, n_elements * sizeof(float), true));
+
+    auto *host_dst = new float[n_elements];
+
+    void *params[] = {
+            &float_dst_va, // dst
+            &n_elements
+    };
+    CUDA_CHECK(
+            libreCuLaunchKernel(func,
+                                n_elements/256, 1, 1,
+                                256, 1, 1,
+                                8192,
+                                stream,
+                                params, sizeof(params) / sizeof(void *),
+                                nullptr
+            )
+    );
+    CUDA_CHECK(libreCuMemCpy(host_dst, float_dst_va, n_elements * sizeof(float), stream, false));
+
+    // dispatch built up command buffer to GPU
+    CUDA_CHECK(libreCuStreamCommence(stream));
+
+    // wait for work to complete
+    CUDA_CHECK(libreCuStreamAwait(stream));
+
+    for (size_t i = 0; i < n_elements; i++) {
+        if (host_dst[i] != 1.0) {
+            std::cerr << "Not all values were filled!" << std::endl;
+            break;
+        }
+    }
+
+    // free memory
+    CUDA_CHECK(libreCuMemFree(float_dst_va));
+
+    delete[] host_dst;
+
+    // destroy stream
+    CUDA_CHECK(libreCuStreamDestroy(stream));
+
+    // unload module
+    CUDA_CHECK(libreCuModuleUnload(module));
+
+    // destroy ctx
+    CUDA_CHECK(libreCuCtxDestroy(ctx));
+    return 0;
+}
\ No newline at end of file
diff --git a/tests/indexing/write_float.asm b/tests/indexing/write_float.asm
new file mode 100644
index 0000000..f633756
--- /dev/null
+++ b/tests/indexing/write_float.asm
@@ -0,0 +1,221 @@
+	.headerflags	@"EF_CUDA_TEXMODE_UNIFIED EF_CUDA_64BIT_ADDRESS EF_CUDA_SM80 EF_CUDA_VIRTUAL_SM(EF_CUDA_SM80)"
+	.elftype	@"ET_EXEC"
+
+
+//--------------------- .debug_frame              --------------------------
+	.section	.debug_frame,"",@progbits
+.debug_frame:
+        /*0000*/ 	.byte	0xff, 0xff, 0xff, 0xff, 0x24, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff
+        /*0010*/ 	.byte	0xff, 0xff, 0xff, 0xff, 0x03, 0x00, 0x04, 0x7c, 0xff, 0xff, 0xff, 0xff, 0x0f, 0x0c, 0x81, 0x80
+        /*0020*/ 	.byte	0x80, 0x28, 0x00, 0x08, 0xff, 0x81, 0x80, 0x28, 0x08, 0x81, 0x80, 0x80, 0x28, 0x00, 0x00, 0x00
+        /*0030*/ 	.byte	0xff, 0xff, 0xff, 0xff, 0x34, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+        /*0040*/ 	.byte	0x00, 0x00, 0x00, 0x00
+        /*0044*/ 	.dword	write_float
+        /*004c*/ 	.byte	0x80, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x04, 0x00, 0x00, 0x00, 0x04, 0x24, 0x00
+        /*005c*/ 	.byte	0x00, 0x00, 0x0c, 0x81, 0x80, 0x80, 0x28, 0x00, 0x04, 0xfc, 0xff, 0xff, 0x3f, 0x00, 0x00, 0x00
+        /*006c*/ 	.byte	0x00, 0x00, 0x00, 0x00
+
+
+//--------------------- .nv.info                  --------------------------
+	.section	.nv.info,"",@"SHT_CUDA_INFO"
+	.align	4
+
+
+	//----- nvinfo : EIATTR_REGCOUNT
+	.align		4
+        /*0000*/ 	.byte	0x04, 0x2f
+        /*0002*/ 	.short	(.L_1 - .L_0)
+	.align		4
+.L_0:
+        /*0004*/ 	.word	index@(write_float)
+        /*0008*/ 	.word	0x0000000a
+
+
+	//----- nvinfo : EIATTR_MIN_STACK_SIZE
+	.align		4
+.L_1:
+        /*000c*/ 	.byte	0x04, 0x12
+        /*000e*/ 	.short	(.L_3 - .L_2)
+	.align		4
+.L_2:
+        /*0010*/ 	.word	index@(write_float)
+        /*0014*/ 	.word	0x00000000
+
+
+	//----- nvinfo : EIATTR_FRAME_SIZE
+	.align		4
+.L_3:
+        /*0018*/ 	.byte	0x04, 0x11
+        /*001a*/ 	.short	(.L_5 - .L_4)
+	.align		4
+.L_4:
+        /*001c*/ 	.word	index@(write_float)
+        /*0020*/ 	.word	0x00000000
+
+
+	//----- nvinfo : EIATTR_MIN_STACK_SIZE
+	.align		4
+.L_5:
+        /*0024*/ 	.byte	0x04, 0x12
+        /*0026*/ 	.short	(.L_7 - .L_6)
+	.align		4
+.L_6:
+        /*0028*/ 	.word	index@(write_float)
+        /*002c*/ 	.word	0x00000000
+.L_7:
+
+
+//--------------------- .nv.info.write_float      --------------------------
+	.section	.nv.info.write_float,"",@"SHT_CUDA_INFO"
+	.sectionflags	@""
+	.align	4
+
+
+	//----- nvinfo : EIATTR_CUDA_API_VERSION
+	.align		4
+        /*0000*/ 	.byte	0x04, 0x37
+        /*0002*/ 	.short	(.L_9 - .L_8)
+.L_8:
+        /*0004*/ 	.word	0x0000007c
+
+
+	//----- nvinfo : EIATTR_SW2861232_WAR
+	.align		4
+.L_9:
+        /*0008*/ 	.byte	0x01, 0x35
+	.zero		2
+
+
+	//----- nvinfo : EIATTR_PARAM_CBANK
+	.align		4
+        /*000c*/ 	.byte	0x04, 0x0a
+        /*000e*/ 	.short	(.L_11 - .L_10)
+	.align		4
+.L_10:
+        /*0010*/ 	.word	index@(.nv.constant0.write_float)
+        /*0014*/ 	.short	0x0160
+        /*0016*/ 	.short	0x0010
+
+
+	//----- nvinfo : EIATTR_CBANK_PARAM_SIZE
+	.align		4
+.L_11:
+        /*0018*/ 	.byte	0x03, 0x19
+        /*001a*/ 	.short	0x0010
+
+
+	//----- nvinfo : EIATTR_KPARAM_INFO
+	.align		4
+        /*001c*/ 	.byte	0x04, 0x17
+        /*001e*/ 	.short	(.L_13 - .L_12)
+.L_12:
+        /*0020*/ 	.word	0x00000000
+        /*0024*/ 	.short	0x0001
+        /*0026*/ 	.short	0x0008
+        /*0028*/ 	.byte	0x00, 0xf0, 0x21, 0x00
+
+
+	//----- nvinfo : EIATTR_KPARAM_INFO
+	.align		4
+.L_13:
+        /*002c*/ 	.byte	0x04, 0x17
+        /*002e*/ 	.short	(.L_15 - .L_14)
+.L_14:
+        /*0030*/ 	.word	0x00000000
+        /*0034*/ 	.short	0x0000
+        /*0036*/ 	.short	0x0000
+        /*0038*/ 	.byte	0x00, 0xf0, 0x21, 0x00
+
+
+	//----- nvinfo : EIATTR_MAXREG_COUNT
+	.align		4
+.L_15:
+        /*003c*/ 	.byte	0x03, 0x1b
+        /*003e*/ 	.short	0x00ff
+
+
+	//----- nvinfo : EIATTR_EXIT_INSTR_OFFSETS
+	.align		4
+        /*0040*/ 	.byte	0x04, 0x1c
+        /*0042*/ 	.short	(.L_17 - .L_16)
+
+
+	//   ....[0]....
+.L_16:
+        /*0044*/ 	.word	0x00000090
+.L_17:
+
+
+//--------------------- .nv.callgraph             --------------------------
+	.section	.nv.callgraph,"",@"SHT_CUDA_CALLGRAPH"
+	.align	4
+	.sectionentsize	8
+	.align		4
+        /*0000*/ 	.word	0x00000000
+	.align		4
+        /*0004*/ 	.word	0xffffffff
+	.align		4
+        /*0008*/ 	.word	0x00000000
+	.align		4
+        /*000c*/ 	.word	0xfffffffe
+	.align		4
+        /*0010*/ 	.word	0x00000000
+	.align		4
+        /*0014*/ 	.word	0xfffffffd
+	.align		4
+        /*0018*/ 	.word	0x00000000
+	.align		4
+        /*001c*/ 	.word	0xfffffffc
+
+
+//--------------------- .nv.rel.action            --------------------------
+	.section	.nv.rel.action,"",@"SHT_CUDA_RELOCINFO"
+	.align	8
+	.sectionentsize	8
+        /*0000*/ 	.byte	0x73, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x25, 0x00, 0x05, 0x36
+
+
+//--------------------- .nv.constant0.write_float --------------------------
+	.section	.nv.constant0.write_float,"a",@progbits
+	.sectionflags	@""
+	.align	4
+.nv.constant0.write_float:
+	.zero		368
+
+
+//--------------------- .text.write_float         --------------------------
+	.section	.text.write_float,"ax",@progbits
+	.sectioninfo	@"SHI_REGISTERS=10"
+	.align	128
+        .global         write_float
+        .type           write_float,@function
+        .size           write_float,(.L_x_1 - write_float)
+        .other          write_float,@"STO_CUDA_ENTRY STV_DEFAULT"
+write_float:
+.text.write_float:
+        /*0000*/                   MOV R1, c[0x0][0x28] ;
+        /*0010*/                   S2R R2, SR_CTAID.X ;
+        /*0020*/                   HFMA2.MMA R5, -RZ, RZ, 0, 2.384185791015625e-07 ;
+        /*0030*/                   MOV R7, 0x3f800000 ;
+        /*0040*/                   ULDC.64 UR4, c[0x0][0x118] ;
+        /*0050*/                   S2R R3, SR_TID.X ;
+        /*0060*/                   IMAD R2, R2, c[0x0][0x0], R3 ;
+        /*0070*/                   IMAD.WIDE.U32 R2, R2, R5, c[0x0][0x160] ;
+        /*0080*/                   STG.E [R2.64], R7 ;
+        /*0090*/                   EXIT ;
+.L_x_0:
+        /*00a0*/                   BRA `(.L_x_0);
+        /*00b0*/                   NOP;
+        /*00c0*/                   NOP;
+        /*00d0*/                   NOP;
+        /*00e0*/                   NOP;
+        /*00f0*/                   NOP;
+        /*0100*/                   NOP;
+        /*0110*/                   NOP;
+        /*0120*/                   NOP;
+        /*0130*/                   NOP;
+        /*0140*/                   NOP;
+        /*0150*/                   NOP;
+        /*0160*/                   NOP;
+        /*0170*/                   NOP;
+.L_x_1:
diff --git a/tests/indexing/write_float.cu b/tests/indexing/write_float.cu
new file mode 100644
index 0000000..cde69bb
--- /dev/null
+++ b/tests/indexing/write_float.cu
@@ -0,0 +1,4 @@
+extern "C" __global__ void write_float(float *dst, size_t n) {
+    size_t tid = blockDim.x * blockIdx.x + threadIdx.x;
+    dst[tid] = 1.0f;
+}
\ No newline at end of file
diff --git a/tests/indexing/write_float.cubin b/tests/indexing/write_float.cubin
new file mode 100644
index 0000000000000000000000000000000000000000..c5f449eee66263005e3f534963cc926aff775095
GIT binary patch
literal 2856
zcmeHJ&rcIk5T5S7O2vjPc#uXD#Xy3WN>tPvjX%(XCLFop0);jy?Pj|oB_u8}@ksm=
zJn|2C@M1js7kD-vj3*8r)pfqN^V-K2O^k^q#7W-H%s2DRy!ZO{ZJ*wKFz@7YSBEGS
zhu+YR*ii9s>co<;kYAuG-6WTvMamIXI`vKzM*cEeMt3(-U+bk#y|ubUmDXmZ(OPR$
zC8}*jl_y~%sx7TG+kVsw4S=1xAJ$d};FWf(6Zx%Z=5QpeHPPfJ(bd}W<MpMr(BG(0
z1*uuUKa8M~-)ydjeo)sl48nHQj=BNUIfe@UO4Mk#{{9yHdZGVf3+!LhIy^_1R~U2Y
z)IelU<8s)n=)T64g{0<tjhj>34}Z~kp$~}V@=+Zt_4~%`LlTzB)YLY7m(&ilSKN)`
zX(}YU@pM6(Y5q0I&ni<>VY&)c(*o*ilFUi6x?G?WY-6QrEVU6*qn2&sH<W%KbF-W?
zTp)*YALUBqmP@406hY;;kk@-K(epLPzI!$ybxzR>4Wx62`JLAW{Y^AaFX;eCWps-2
z*YAjfewgonIi5S<Z1anN!|3t&=ux?gUz}x%$Hftvjq(YObAtEfgGbI5-nn>A=d=HM
zo1^m#p$7ZquFD3?1dquPn%s7DLD$(EqIc(MYLop%qCCO(=y{hMR8-|s5&VZ;kRh>4
zpFO(A{sEuQBzfl2TaU(jaZWh~zFz7dJn*x~jvuO06%Y5GEg4te#7FUOOQM${WL!GU
z#pT4<ZD>(I+}{wmxJS&*JPwcr@oY^UkVysF=|8f$m(ox+jRN7%2-U0+XW16H=YpyY
zvi(vB_XUTGuSb@T?-0*vp#LY~U*I?-+u!U1zE@m#fM1YCCC%<X4A1CS96HIP+E3Be
zc!rPufDF&nMPuygEgrT^3qP<n{HXQgdybTWZ-fA`@X?>4;h428NXzpu|1<3v?0o@@
XHuFbW@XS0Vt)~`c!;Q}Jez)Hbs$oKD

literal 0
HcmV?d00001

diff --git a/tests/indexing/write_float.ptx b/tests/indexing/write_float.ptx
new file mode 100644
index 0000000..669cf65
--- /dev/null
+++ b/tests/indexing/write_float.ptx
@@ -0,0 +1,37 @@
+//
+// Generated by NVIDIA NVVM Compiler
+//
+// Compiler Build ID: CL-34097967
+// Cuda compilation tools, release 12.4, V12.4.131
+// Based on NVVM 7.0.1
+//
+
+.version 8.4
+.target sm_80
+.address_size 64
+
+	// .globl	write_float
+
+.visible .entry write_float(
+	.param .u64 write_float_param_0,
+	.param .u64 write_float_param_1
+)
+{
+	.reg .b32 	%r<6>;
+	.reg .b64 	%rd<5>;
+
+
+	ld.param.u64 	%rd1, [write_float_param_0];
+	cvta.to.global.u64 	%rd2, %rd1;
+	mov.u32 	%r1, %ntid.x;
+	mov.u32 	%r2, %ctaid.x;
+	mov.u32 	%r3, %tid.x;
+	mad.lo.s32 	%r4, %r1, %r2, %r3;
+	mul.wide.u32 	%rd3, %r4, 4;
+	add.s64 	%rd4, %rd2, %rd3;
+	mov.u32 	%r5, 1065353216;
+	st.global.u32 	[%rd4], %r5;
+	ret;
+
+}
+