Skip to content

Commit

Permalink
Add indexing tests
Browse files Browse the repository at this point in the history
  • Loading branch information
mikex86 committed Sep 5, 2024
1 parent d2805f0 commit a91732a
Show file tree
Hide file tree
Showing 8 changed files with 405 additions and 5 deletions.
4 changes: 0 additions & 4 deletions driverapi/src/cmdqueue.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -295,8 +295,6 @@ libreCudaStatus_t NvCommandQueue::signalNotify(NvSignal *pSignal, NvU32 signalTa
LIBRECUDA_ERR_PROPAGATE(enqueue(
makeNvMethod(4, NVC6B5_SET_SEMAPHORE_A, 3),
{
// little endian inside NvU32s but big endian across NvU32s for some reason...
// don't question nvidia's autism...
U64_HI_32_BITS(pSignal),
U64_LO_32_BITS(pSignal),

Expand Down Expand Up @@ -417,7 +415,6 @@ libreCudaStatus_t NvCommandQueue::ensureEnoughLocalMem(LibreCUFunction function)
LIBRECUDA_ERR_PROPAGATE(enqueue(
makeNvMethod(1, NVC6C0_SET_SHADER_LOCAL_MEMORY_A, 2),
{
// weird half big and little endian along int borders again...
U64_HI_32_BITS(function->shader_local_memory_va),
U64_LO_32_BITS(function->shader_local_memory_va)
},
Expand All @@ -426,7 +423,6 @@ libreCudaStatus_t NvCommandQueue::ensureEnoughLocalMem(LibreCUFunction function)
LIBRECUDA_ERR_PROPAGATE(enqueue(
makeNvMethod(1, NVC6C0_SET_SHADER_LOCAL_MEMORY_NON_THROTTLED_A, 3),

// weird half big and little endian along int borders again...
{
U64_HI_32_BITS(bytes_per_tpc),
U64_LO_32_BITS(bytes_per_tpc),
Expand Down
3 changes: 2 additions & 1 deletion tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,5 @@ add_subdirectory(dynamic_shared_mem)
add_subdirectory(compute_chronological_consistency)
add_subdirectory(test_async_kernels)
add_subdirectory(dma_chronological_consistency)
add_subdirectory(kernel_struct_param)
add_subdirectory(kernel_struct_param)
add_subdirectory(indexing)
11 changes: 11 additions & 0 deletions tests/indexing/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Test executable for the indexing test; loads write_float.cubin at runtime.
add_executable(
test_indexing
main.cpp
)
# Link against the librecuda driver API implementation.
target_link_libraries(
test_indexing
PRIVATE
driverapi
)

# NOTE(review): the cubin is copied to ${CMAKE_BINARY_DIR}/tests/test_indexing/,
# but this subdirectory is tests/indexing/ — confirm the executable's output /
# working directory actually is tests/test_indexing/ (as other tests may set up),
# otherwise main.cpp's open of "write_float.cubin" will find no file and read 0 bytes.
configure_file("${CMAKE_CURRENT_LIST_DIR}/write_float.cubin" ${CMAKE_BINARY_DIR}/tests/test_indexing/ COPYONLY)
130 changes: 130 additions & 0 deletions tests/indexing/main.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
#include <librecuda.h>

#include <iostream>
#include <iomanip>
#include <vector>
#include <fstream>
#include <cstring>

/// Aborts the process with a readable message when a librecuda driver call fails.
/// @param error status code returned by a libreCu* call
/// @param file  call-site source file (supplied by the CUDA_CHECK macro via __FILE__)
/// @param line  call-site source line (supplied by the CUDA_CHECK macro via __LINE__)
inline void cudaCheck(libreCudaStatus_t error, const char *file, int line) {
    if (error != LIBRECUDA_SUCCESS) {
        const char *error_string;
        libreCuGetErrorString(error, &error_string);
        // report on stderr so the diagnostic is not lost in buffered stdout
        fprintf(stderr, "[CUDA ERROR] at file %s:%d: %s\n", file, line, error_string);
        exit(EXIT_FAILURE);
    }
}  // (stray ';' after the function body removed)
#define CUDA_CHECK(err) (cudaCheck(err, __FILE__, __LINE__))

int main() {
CUDA_CHECK(libreCuInit(0));

int device_count{};
CUDA_CHECK(libreCuDeviceGetCount(&device_count));
std::cout << "Device count: " + std::to_string(device_count) << std::endl;

LibreCUdevice device{};
CUDA_CHECK(libreCuDeviceGet(&device, 0));

LibreCUcontext ctx{};
CUDA_CHECK(libreCuCtxCreate_v2(&ctx, CU_CTX_SCHED_YIELD, device));

char name_buffer[256] = {};
libreCuDeviceGetName(name_buffer, 256, device);
std::cout << "Device Name: " + std::string(name_buffer) << std::endl;

LibreCUmodule module{};

// read cubin file
uint8_t *image;
size_t n_bytes;
{
std::ifstream input("write_float.cubin", std::ios::binary);
std::vector<uint8_t> bytes(
(std::istreambuf_iterator<char>(input)),
(std::istreambuf_iterator<char>()));
input.close();
image = new uint8_t[bytes.size()];
memcpy(image, bytes.data(), bytes.size());
n_bytes = bytes.size();
}
CUDA_CHECK(libreCuModuleLoadData(&module, image, n_bytes));

// read functions
uint32_t num_funcs{};
CUDA_CHECK(libreCuModuleGetFunctionCount(&num_funcs, module));
std::cout << "Num functions: " << num_funcs << std::endl;

auto *functions = new LibreCUFunction[num_funcs];
CUDA_CHECK(libreCuModuleEnumerateFunctions(functions, num_funcs, module));

for (size_t i = 0; i < num_funcs; i++) {
LibreCUFunction func = functions[i];
const char *func_name{};
CUDA_CHECK(libreCuFuncGetName(&func_name, func));
std::cout << " function \"" << func_name << "\"" << std::endl;
}

delete[] functions;

// find function
LibreCUFunction func{};
CUDA_CHECK(libreCuModuleGetFunction(&func, module, "write_float"));

// set dynamic shared memory
CUDA_CHECK(libreCuFuncSetAttribute(func, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, 8192));

// create stream
LibreCUstream stream{};
CUDA_CHECK(libreCuStreamCreate(&stream, 0));

void *float_dst_va{};
size_t n_elements = 50256 * 768;
CUDA_CHECK(libreCuMemAlloc(&float_dst_va, n_elements * sizeof(float), true));

auto *host_dst = new float[n_elements];

void *params[] = {
&float_dst_va, // dst
&n_elements
};
CUDA_CHECK(
libreCuLaunchKernel(func,
n_elements/256, 1, 1,
256, 1, 1,
8192,
stream,
params, sizeof(params) / sizeof(void *),
nullptr
)
);
CUDA_CHECK(libreCuMemCpy(host_dst, float_dst_va, n_elements * sizeof(float), stream, false));

// dispatch built up command buffer to GPU
CUDA_CHECK(libreCuStreamCommence(stream));

// wait for work to complete
CUDA_CHECK(libreCuStreamAwait(stream));

for (size_t i = 0; i < n_elements; i++) {
if (host_dst[i] != 1.0) {
std::cerr << "Not all values were filled!" << std::endl;
break;
}
}

// free memory
CUDA_CHECK(libreCuMemFree(float_dst_va));

delete[] host_dst;

// destroy stream
CUDA_CHECK(libreCuStreamDestroy(stream));

// unload module
CUDA_CHECK(libreCuModuleUnload(module));

// destroy ctx
CUDA_CHECK(libreCuCtxDestroy(ctx));
return 0;
}
Loading

0 comments on commit a91732a

Please sign in to comment.