From c43f6d173cdc15131176ed35c7fd93fbc5e14c06 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Mon, 18 Dec 2023 12:45:53 -0600 Subject: [PATCH 1/7] Align versions for cudnn, clang-tools, cython, and doxygen with the rest of RAPIDS. (#112) This PR aligns versions for wholegraph dependencies with versions used by other RAPIDS packages. This is needed for devcontainers, to make the unified RAPIDS conda environment solvable. See https://github.com/rapidsai/devcontainers/pull/191. Authors: - Bradley Dice (https://github.com/bdice) Approvers: - AJ Schmidt (https://github.com/ajschmidt8) - Ray Douglass (https://github.com/raydouglass) URL: https://github.com/rapidsai/wholegraph/pull/112 --- conda/environments/all_cuda-118_arch-x86_64.yaml | 10 +++++----- conda/environments/all_cuda-120_arch-x86_64.yaml | 10 +++++----- dependencies.yaml | 14 +++++++------- python/pylibwholegraph/pyproject.toml | 2 +- 4 files changed, 18 insertions(+), 18 deletions(-) diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 292c809d4..c7410ce71 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -9,15 +9,15 @@ channels: dependencies: - breathe - c-compiler -- clang-tools=16.0.0 -- clangxx=16.0.0 +- clang-tools==16.0.6 +- clangxx==16.0.6 - cmake>=3.26.4 - cuda-nvtx=11.8 - cudatoolkit=11.8 -- cudnn=8.4 +- cudnn=8.8 - cxx-compiler -- cython -- doxygen=1.8.20 +- cython>=3.0.0 +- doxygen==1.9.1 - gcc_linux-64=11.* - gitpython - graphviz diff --git a/conda/environments/all_cuda-120_arch-x86_64.yaml b/conda/environments/all_cuda-120_arch-x86_64.yaml index b436d5641..568c312b4 100644 --- a/conda/environments/all_cuda-120_arch-x86_64.yaml +++ b/conda/environments/all_cuda-120_arch-x86_64.yaml @@ -9,17 +9,17 @@ channels: dependencies: - breathe - c-compiler -- clang-tools=16.0.0 -- clangxx=16.0.0 +- clang-tools==16.0.6 +- clangxx==16.0.6 - cmake>=3.26.4 - cuda-cudart-dev - cuda-nvcc - cuda-nvtx - cuda-version=12.0 -- cudnn=8.4 +- cudnn=8.8 - cxx-compiler -- cython -- doxygen=1.8.20 +- cython>=3.0.0 +- doxygen==1.9.1 - gcc_linux-64=11.* - gitpython - graphviz diff --git a/dependencies.yaml b/dependencies.yaml index 71c0dc93d..2a52b450f 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -68,10 +68,10 @@ dependencies: packages: - c-compiler - cmake>=3.26.4 - - cudnn=8.4 + - cudnn=8.8 - cxx-compiler - - cython - - doxygen=1.8.20 + - cython>=3.0.0 + - &doxygen doxygen==1.9.1 - libraft-headers==24.2.* - librmm==24.2.* - nanobind>=0.2.0 @@ -252,7 +252,7 @@ dependencies: - output_types: [conda, requirements] packages: - breathe - - doxygen=1.8.20 + - *doxygen - graphviz - ipython - ipykernel @@ -274,15 +274,15 @@ dependencies: common: - output_types: [conda, requirements] packages: - - clangxx=16.0.0 - - clang-tools=16.0.0 + - clangxx==16.0.6 + - clang-tools==16.0.6 - gitpython python_build_wheel: common: - output_types: [pyproject] packages: - cmake>=3.26.4 - - cython>=0.29,<0.30 + - cython>=3.0.0 - ninja - setuptools - scikit-build>=0.13.1 diff --git a/python/pylibwholegraph/pyproject.toml b/python/pylibwholegraph/pyproject.toml index ccb14b831..910d4ccea 100644 --- a/python/pylibwholegraph/pyproject.toml +++ b/python/pylibwholegraph/pyproject.toml @@ -15,7 +15,7 @@ [build-system] requires = [ "cmake>=3.26.4", - "cython>=0.29,<0.30", + "cython>=3.0.0", "ninja", "scikit-build>=0.13.1", "setuptools", From ef5b3eef3f89dc2c9536fe9abce2d0c93cc9a364 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani 
Date: Tue, 19 Dec 2023 09:28:02 -0800 Subject: [PATCH 2/7] Don't overwrite wholegraph_ROOT if provided (#114) This change allows standard CMake specification of the C++ package directory (via `-Dwholegraph_ROOT`) to also work during the Python build. Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Bradley Dice (https://github.com/bdice) - Ray Douglass (https://github.com/raydouglass) URL: https://github.com/rapidsai/wholegraph/pull/114 --- python/pylibwholegraph/CMakeLists.txt | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/python/pylibwholegraph/CMakeLists.txt b/python/pylibwholegraph/CMakeLists.txt index a8cd320e0..758fe3713 100644 --- a/python/pylibwholegraph/CMakeLists.txt +++ b/python/pylibwholegraph/CMakeLists.txt @@ -113,7 +113,9 @@ include(../../cpp/cmake/thirdparty/get_raft.cmake) #include(${CMAKE_CURRENT_LIST_DIR}/../cmake/thirdparty/nanobind.cmake) # use _ROOT here to take precedence over any other package -set(wholegraph_ROOT "$ENV{LIBWHOLEGRAPH_DIR}") +if (DEFINED ENV{LIBWHOLEGRAPH_DIR}) + set(wholegraph_ROOT "$ENV{LIBWHOLEGRAPH_DIR}") +endif() find_package(wholegraph "${RAPIDS_VERSION}.0" EXACT) message("WholeGraph") if (WHOLEGRAPH_FOUND) From aaceac529dc3a87286fd1cb2a435f42d3404d735 Mon Sep 17 00:00:00 2001 From: dongxuy04 <78518666+dongxuy04@users.noreply.github.com> Date: Tue, 9 Jan 2024 23:16:01 +0800 Subject: [PATCH 3/7] added Direct IO support for WholeMemory loading (#113) Add Direct IO support option for WholeMemory loading from disk. Using Direct IO may be faster on some high performance file systems. Authors: - https://github.com/dongxuy04 - Brad Rees (https://github.com/BradReesWork) Approvers: - Brad Rees (https://github.com/BradReesWork) URL: https://github.com/rapidsai/wholegraph/pull/113 --- cpp/src/wholememory/file_io.cpp | 469 +++++++++++++++++++++++++------- 1 file changed, 372 insertions(+), 97 deletions(-) diff --git a/cpp/src/wholememory/file_io.cpp b/cpp/src/wholememory/file_io.cpp index 3274811e1..0540a3f5d 100644 --- a/cpp/src/wholememory/file_io.cpp +++ b/cpp/src/wholememory/file_io.cpp @@ -15,14 +15,17 @@ */ #include "file_io.h" +#include #include #include +#include #include #include #include "communicator.hpp" #include "error.hpp" +#include "integer_utils.hpp" #include "logger.hpp" namespace wholememory { @@ -38,6 +41,15 @@ static size_t StatFileSize(const char* filename) return filesize; } +static size_t StatFileBlockSize(const char* filename) +{ + auto blocksize = static_cast(-1); + struct stat statbuf {}; + if (stat(filename, &statbuf) < 0) { return blocksize; } + blocksize = statbuf.st_blksize; + return blocksize; +} + static size_t get_handle_partial_size(size_t handle_size, size_t memory_offset, size_t memory_entry_stride, @@ -62,6 +74,317 @@ static size_t get_handle_partial_size(size_t handle_size, return partial_size; } +/*! + * Read from file list to local memory of WholeMemory. File list are binary files, which are + * considered to be concatenated together. All ranks in WholeMemory will read the files in parallel + * and load each part into local memory of each rank. + * @param local_ptr : Pointer to local memory of WholeMemory + * @param local_size : Local memory size + * @param local_offset : The offset of local memory in WholeMemory. + * @param entry_size : The entry size of each data entry. + * @param memory_entry_stride : The stride of each entry in WholeMemory + * @param memory_offset : The start offset to place the read data. 
Should be in range [0, + * memory_entry_stride) + * @param file_count : Total file count of the file list + * @param file_names : File names of the file list. + * @param file_sizes : Sizes of each file. + * @param suggested_buffer_size : Suggested buffer size to read. + * @param wm_rank : WholeMemory rank. + */ +static void read_file_list_to_local_memory(char* local_ptr, + size_t local_size, + size_t local_offset, + size_t entry_size, + size_t memory_entry_stride, + size_t memory_offset, + int file_count, + const char** file_names, + const std::vector& file_sizes, + size_t suggested_buffer_size, + int wm_rank) +{ + size_t buffer_size; + size_t buffer_entry_count = 1; + if (suggested_buffer_size < entry_size) { + buffer_size = entry_size; + } else { + buffer_entry_count = suggested_buffer_size / entry_size; + buffer_size = buffer_entry_count * entry_size; + } + std::vector file_read_buffer(buffer_size); + + size_t local_entry_memory_start_index = local_offset / memory_entry_stride; + size_t local_entry_file_start_index = + local_entry_memory_start_index - memory_offset / memory_entry_stride; + size_t local_entry_count = local_size / memory_entry_stride; + char* local_write_ptr = local_ptr + memory_offset % memory_entry_stride; + if (wm_rank == 0) { + local_entry_count -= memory_offset / memory_entry_stride; + local_write_ptr += (memory_offset / memory_entry_stride) * memory_entry_stride; + } + size_t local_entry_idx = 0; + + size_t file_entry_offset = 0; + size_t total_read_bytes = 0; + for (int i = 0; i < file_count; i++) { + size_t file_entry_count = file_sizes[i] / entry_size; + // already outside reading window + if (file_entry_offset >= local_entry_file_start_index + local_entry_count) break; + // in reading window + if (file_entry_offset + file_entry_count > local_entry_file_start_index) { + size_t file_read_start_offset = 0; + FILE* fp = fopen(file_names[i], "rb"); + if (fp == nullptr) { WHOLEMEMORY_ERROR("Open file %s for read failed.", file_names[i]); } + // maybe in window end, remove possible tailing data that don't belong to current rank. + size_t to_read_file_entry_count = std::min( + file_entry_count, local_entry_file_start_index + local_entry_count - file_entry_offset); + // if in window begin, remove possible data that belongs to previous rank and skip disk + // data. + if (file_entry_offset < local_entry_file_start_index) { + size_t skip_entry_count = local_entry_file_start_index - file_entry_offset; + + file_read_start_offset = skip_entry_count * entry_size; + + if (fseeko(fp, file_read_start_offset, SEEK_SET) != 0) { + WHOLEMEMORY_ERROR( + "File %s seek to %ld failed.", file_names[i], skip_entry_count * entry_size); + } + to_read_file_entry_count -= skip_entry_count; + } + // now all data in file_entry_count need to be read. 
+ size_t bytes_to_read = to_read_file_entry_count * entry_size; + size_t left_entry_count = to_read_file_entry_count; + while (left_entry_count > 0) { + size_t read_entry_count = std::min(left_entry_count, buffer_entry_count); + + int ret = fread(file_read_buffer.data(), entry_size, read_entry_count, fp); + if (ret != read_entry_count) { + WHOLEMEMORY_ERROR( + "File %s line %d: reading from file %s, read_entry_count=%ld, entry_size=%ld, " + "returned %d, error=%s\n", + __FILE__, + __LINE__, + file_names[i], + read_entry_count, + entry_size, + ret, + strerror(errno)); + } + + if (entry_size != memory_entry_stride) { + WM_CUDA_CHECK(cudaMemcpy2D(local_write_ptr, + memory_entry_stride, + file_read_buffer.data(), + entry_size, + entry_size, + read_entry_count, + cudaMemcpyDefault)); + } else { + WM_CUDA_CHECK(cudaMemcpy(local_write_ptr, + file_read_buffer.data(), + read_entry_count * entry_size, + cudaMemcpyDefault)); + } + local_write_ptr += read_entry_count * memory_entry_stride; + + left_entry_count -= read_entry_count; + } + fclose(fp); + WHOLEMEMORY_INFO( + "Rank=%d done Reading %ld bytes from file %s size=%ld, starting from offset=%ld.", + wm_rank, + bytes_to_read, + file_names[i], + file_sizes[i], + file_read_start_offset); + total_read_bytes += bytes_to_read; + } + file_entry_offset += file_entry_count; + } + WHOLEMEMORY_INFO( + "Rank=%d done reading total %ld bytes from needed files.", wm_rank, total_read_bytes); +} + +/*! + * Read from file list to local memory of WholeMemory using DirectIO. Using DirectIO may have better + * performance by bypassing system cache if it is bottleneck. File list are binary files, which are + * considered to be concatenated together. All ranks in WholeMemory will read the files in parallel + * and load each part into local memory of each rank. + * @param local_ptr : Pointer to local memory of WholeMemory + * @param local_size : Local memory size + * @param local_offset : The offset of local memory in WholeMemory. + * @param entry_size : The entry size of each data entry. + * @param memory_entry_stride : The stride of each entry in WholeMemory + * @param memory_offset : The start offset to place the read data. Should be in range [0, + * memory_entry_stride) + * @param file_count : Total file count of the file list + * @param file_names : File names of the file list. + * @param file_sizes : Sizes of each file. + * @param suggested_buffer_size : Suggested buffer size to read. + * @param wm_rank : WholeMemory rank. 
+ */ +static void read_file_list_to_local_memory_directio(char* local_ptr, + size_t local_size, + size_t local_offset, + size_t entry_size, + size_t memory_entry_stride, + size_t memory_offset, + int file_count, + const char** file_names, + const std::vector& file_sizes, + size_t suggested_buffer_size, + int wm_rank) +{ + if (memory_offset + entry_size > memory_entry_stride) { + WHOLEMEMORY_FAIL_NOTHROW("Direct io mode only support reading all entries."); + } + size_t local_entry_start_index = local_offset / memory_entry_stride; + size_t local_entry_count = local_size / memory_entry_stride; + char* local_write_ptr = local_ptr + memory_offset % memory_entry_stride; + + static size_t kAlignSize = 16 * 1024 * 1024; + suggested_buffer_size = round_up_unsafe(suggested_buffer_size, kAlignSize); + + char* block_buffer; + WHOLEMEMORY_CHECK_NOTHROW(posix_memalign(reinterpret_cast(&block_buffer), + kAlignSize, + suggested_buffer_size) == 0); + + size_t file_entry_offset = 0; + size_t read_entry_count = 0; + for (int i = 0; i < file_count; i++) { + size_t file_entry_count = file_sizes[i] / entry_size; + // already outside reading window + if (file_entry_offset >= local_entry_start_index + local_entry_count) break; + // reading window not reached + if (file_entry_offset + file_entry_count <= local_entry_start_index) { + file_entry_offset += file_entry_count; + continue; + } + // in reading window + auto block_size = StatFileBlockSize(file_names[i]); + if (block_size == 0 || block_size == (size_t)-1 || kAlignSize % block_size != 0) { + WHOLEMEMORY_FAIL_NOTHROW( + "block_size=%ld for file %s, but alignment is %ld", block_size, file_names[i], kAlignSize); + } + size_t buffer_block_count = suggested_buffer_size / block_size; + int fd = open(file_names[i], O_DIRECT | O_RDONLY); + if (fd < 0) { WHOLEMEMORY_FAIL_NOTHROW("Open file %s with direct io failed.", file_names[i]); } + + // maybe in window end, remove possible tailing data that don't belong to current rank. + size_t to_read_file_entry_count = + std::min(file_entry_count, local_entry_start_index + local_entry_count - file_entry_offset); + + size_t file_read_end = to_read_file_entry_count * entry_size; + // if in window begin, remove possible data that belongs to previous rank and skip disk + // data. 
+ size_t file_read_start = 0; + if (file_entry_offset < local_entry_start_index) { + size_t skip_entry_count = local_entry_start_index - file_entry_offset; + to_read_file_entry_count -= skip_entry_count; + file_read_start = skip_entry_count * entry_size; + } + + size_t file_block_read_offset = file_read_start / block_size * block_size; + size_t skip_head_size = file_read_start - file_block_read_offset; + + char* local_mem_write_entry_for_file = local_write_ptr + read_entry_count * memory_entry_stride; + size_t first_mem_entry_offset = 0; + size_t useful_data_bytes_read = 0; + size_t physical_data_bytes_read = 0; + while (file_block_read_offset < file_read_end) { + size_t left_size = file_read_end - file_block_read_offset; + size_t left_block_count = div_rounding_up_unsafe(left_size, block_size); + size_t read_block_count = std::min(left_block_count, buffer_block_count); + size_t physical_read_size = read_block_count * block_size; + physical_data_bytes_read += physical_read_size; + + ssize_t pread_size = pread64(fd, block_buffer, physical_read_size, file_block_read_offset); + if (pread_size != physical_read_size && + file_block_read_offset + pread_size != file_sizes[i]) { + WHOLEMEMORY_FAIL_NOTHROW( + "rank=%d, pread_size=%ld, physical_read_size=%ld, file_block_read_offset=%ld, " + "file_sizes[i]=%ld, file=%s", + wm_rank, + pread_size, + physical_read_size, + file_block_read_offset, + file_sizes[i], + file_names[i]); + } + + size_t drop_tail_size = 0; + if (file_block_read_offset + physical_read_size > file_read_end) { + drop_tail_size = file_block_read_offset + physical_read_size - file_read_end; + } + + char* useful_data_ptr = block_buffer + skip_head_size; + size_t useful_data_size = physical_read_size - skip_head_size - drop_tail_size; + + useful_data_bytes_read += useful_data_size; + + if (first_mem_entry_offset != 0) { + // process head + size_t entry_left_size = entry_size - first_mem_entry_offset; + WM_CUDA_CHECK_NO_THROW(cudaMemcpy(local_mem_write_entry_for_file + first_mem_entry_offset, + useful_data_ptr, + entry_left_size, + cudaMemcpyDefault)); + local_mem_write_entry_for_file += memory_entry_stride; + useful_data_ptr += entry_left_size; + useful_data_size -= entry_left_size; + entry_left_size = 0; + } + + size_t full_entry_count = useful_data_size / entry_size; + size_t full_entry_size = full_entry_count * entry_size; + + if (full_entry_size > 0) { + if (entry_size != memory_entry_stride) { + WM_CUDA_CHECK(cudaMemcpy2D(local_mem_write_entry_for_file, + memory_entry_stride, + useful_data_ptr, + entry_size, + entry_size, + full_entry_count, + cudaMemcpyDefault)); + } else { + WM_CUDA_CHECK(cudaMemcpy( + local_mem_write_entry_for_file, useful_data_ptr, full_entry_size, cudaMemcpyDefault)); + } + local_mem_write_entry_for_file += memory_entry_stride * full_entry_count; + useful_data_ptr += full_entry_size; + useful_data_size -= full_entry_size; + } + + size_t tail_entry_size = useful_data_size % entry_size; + if (tail_entry_size != 0) { + // process tail + WM_CUDA_CHECK_NO_THROW(cudaMemcpy( + local_mem_write_entry_for_file, useful_data_ptr, tail_entry_size, cudaMemcpyDefault)); + first_mem_entry_offset = tail_entry_size; + } + + file_block_read_offset += physical_read_size; + skip_head_size = 0; + } + + WHOLEMEMORY_INFO( + "Rank=%d done Reading %ld useful bytes by reading %ld block bytes using DirectIO from file " + "%s size=%ld.", + wm_rank, + useful_data_bytes_read, + physical_data_bytes_read, + file_names[i], + file_sizes[i]); + + close(fd); + file_entry_offset += 
file_entry_count; + read_entry_count += to_read_file_entry_count; + } + free(block_buffer); +} + wholememory_error_code_t load_file_to_handle(wholememory_handle_t wholememory_handle, size_t memory_offset, size_t memory_entry_stride, @@ -153,107 +476,59 @@ wholememory_error_code_t load_file_to_handle(wholememory_handle_t wholememory_ha (void**)(&local_ptr), &local_size, &local_offset, wholememory_handle) == WHOLEMEMORY_SUCCESS); - constexpr int kSuggestedBufferSize = 16 * 1024 * 1024; - size_t buffer_size; - size_t buffer_entry_count = 1; - if (kSuggestedBufferSize < entry_size) { - buffer_size = entry_size; - } else { - buffer_entry_count = kSuggestedBufferSize / entry_size; - buffer_size = buffer_entry_count * entry_size; + int suggested_buffer_size_mb = 16; + const char* buffer_size_env_var = std::getenv("WG_LOAD_BUFFER_SIZE_MB"); + if (buffer_size_env_var != nullptr) { + try { + suggested_buffer_size_mb = std::stoi(buffer_size_env_var); + } catch (const std::invalid_argument& e) { + suggested_buffer_size_mb = 16; + WHOLEMEMORY_WARN( + "Environment variable WG_LOAD_BUFFER_SIZE_MB value %s is not valid, using default %d", + buffer_size_env_var, + suggested_buffer_size_mb); + } + if (suggested_buffer_size_mb < 1) { + suggested_buffer_size_mb = 16; + WHOLEMEMORY_WARN( + "Environment variable WG_LOAD_BUFFER_SIZE_MB value %s is not valid, using default %d", + buffer_size_env_var, + suggested_buffer_size_mb); + } } - std::vector file_read_buffer(buffer_size); + size_t suggested_buffer_size = static_cast(suggested_buffer_size_mb) * 1024 * 1024; - size_t local_entry_memory_start_index = local_offset / memory_entry_stride; - size_t local_entry_file_start_index = - local_entry_memory_start_index - memory_offset / memory_entry_stride; - size_t local_entry_count = local_size / memory_entry_stride; - char* local_write_ptr = local_ptr + memory_offset % memory_entry_stride; - if (wm_rank == 0) { - local_entry_count -= memory_offset / memory_entry_stride; - local_write_ptr += (memory_offset / memory_entry_stride) * memory_entry_stride; + const char* directio_env_var = std::getenv("WG_LOAD_USE_DIRECTIO"); + bool use_direct_io = false; + if (directio_env_var != nullptr && directio_env_var[0] == '1' && directio_env_var[1] == '\0') { + use_direct_io = true; } - size_t local_entry_idx = 0; - - size_t file_entry_offset = 0; - size_t total_read_bytes = 0; - for (int i = 0; i < file_count; i++) { - size_t file_entry_count = file_sizes[i] / entry_size; - // already outside reading window - if (file_entry_offset >= local_entry_file_start_index + local_entry_count) break; - // in reading window - if (file_entry_offset + file_entry_count > local_entry_file_start_index) { - size_t file_read_start_offset = 0; - FILE* fp = fopen(file_names[i], "rb"); - if (fp == nullptr) { WHOLEMEMORY_ERROR("Open file %s for read failed.", file_names[i]); } - // maybe in window end, remove possible tailing data that don't belong to current rank. - size_t to_read_file_entry_count = std::min( - file_entry_count, local_entry_file_start_index + local_entry_count - file_entry_offset); - // if in window begin, remove possible data that belongs to previous rank and skip disk - // data. 
-        if (file_entry_offset < local_entry_file_start_index) {
-          size_t skip_entry_count = local_entry_file_start_index - file_entry_offset;
-
-          file_read_start_offset = skip_entry_count * entry_size;
-
-          if (fseeko(fp, file_read_start_offset, SEEK_SET) != 0) {
-            WHOLEMEMORY_ERROR(
-              "File %s seek to %ld failed.", file_names[i], skip_entry_count * entry_size);
-          }
-          to_read_file_entry_count -= skip_entry_count;
-        }
-        // now all data in file_entry_count need to be read.
-        size_t bytes_to_read    = to_read_file_entry_count * entry_size;
-        size_t left_entry_count = to_read_file_entry_count;
-        while (left_entry_count > 0) {
-          size_t read_entry_count = std::min(left_entry_count, buffer_entry_count);
-
-          int ret = fread(file_read_buffer.data(), entry_size, read_entry_count, fp);
-          if (ret != read_entry_count) {
-            WHOLEMEMORY_ERROR(
-              "File %s line %d: reading from file %s, read_entry_count=%ld, entry_size=%ld, "
-              "returned %d, error=%s\n",
-              __FILE__,
-              __LINE__,
-              file_names[i],
-              read_entry_count,
-              entry_size,
-              ret,
-              strerror(errno));
-          }
-
-          if (entry_size != memory_entry_stride) {
-            WM_CUDA_CHECK(cudaMemcpy2D(local_write_ptr,
-                                       memory_entry_stride,
-                                       file_read_buffer.data(),
-                                       entry_size,
-                                       entry_size,
-                                       read_entry_count,
-                                       cudaMemcpyDefault));
-          } else {
-            WM_CUDA_CHECK(cudaMemcpy(local_write_ptr,
-                                     file_read_buffer.data(),
-                                     read_entry_count * entry_size,
-                                     cudaMemcpyDefault));
-          }
-          local_write_ptr += read_entry_count * memory_entry_stride;
-
-          left_entry_count -= read_entry_count;
-        }
-        fclose(fp);
-        WHOLEMEMORY_INFO(
-          "Rank=%d done Reading %ld bytes from file %s size=%ld, starting from offset=%ld.",
-          wm_rank,
-          bytes_to_read,
-          file_names[i],
-          file_sizes[i],
-          file_read_start_offset);
-        total_read_bytes += bytes_to_read;
-      }
-      file_entry_offset += file_entry_count;
+    if (!use_direct_io) {
+      read_file_list_to_local_memory(local_ptr,
+                                     local_size,
+                                     local_offset,
+                                     entry_size,
+                                     memory_entry_stride,
+                                     memory_offset,
+                                     file_count,
+                                     file_names,
+                                     file_sizes,
+                                     suggested_buffer_size,
+                                     wm_rank);
+    } else {
+      read_file_list_to_local_memory_directio(local_ptr,
+                                              local_size,
+                                              local_offset,
+                                              entry_size,
+                                              memory_entry_stride,
+                                              memory_offset,
+                                              file_count,
+                                              file_names,
+                                              file_sizes,
+                                              suggested_buffer_size,
+                                              wm_rank);
     }
-    WHOLEMEMORY_INFO(
-      "Rank=%d done reading total %ld bytes from needed files.", wm_rank, total_read_bytes);
+    wm_comm->barrier();
   } catch (wholememory::logic_error& wle) {
     WHOLEMEMORY_ERROR("Logic error: %s", wle.what());

From 7025eafa75567f7353ae97b5f9077ef9f9879649 Mon Sep 17 00:00:00 2001
From: James Lamb
Date: Thu, 11 Jan 2024 13:54:01 -0600
Subject: [PATCH 4/7] refactor CUDA versions in dependencies.yaml (#115)

Contributes to https://github.com/rapidsai/build-planning/issues/7.

Proposes splitting the `cuda-version` dependency in `dependencies.yaml` out to
its own thing, separate from the bits of the CUDA Toolkit this project needs.

### Benefits of this change

* prevents accidental inclusion of multiple `cuda-version` versions in environments
* reduces update effort (via enabling more use of globs like `"12.*"`)
* improves the chance that errors like "`conda` recipe is missing a dependency" are caught in CI

### Notes for Reviewers

This change was intended to just re-organize `dependencies.yaml`, but I do think
the one additional change it introduces to `all_cuda-118_arch-x86_64.yaml` is a
good one. I *think* requiring the `cuda-version` metapackage in all environments
is useful to protect against environment solves that result in runtime issues.
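As a concrete illustration, here is a minimal, abridged sketch (not the full
generated file) of what `all_cuda-118_arch-x86_64.yaml` looks like after this
change: only the `cuda-version` metapackage carries the version pin, and the
toolkit packages are left unpinned, to be constrained through `cuda-version`
as described in the feedstock reference below.

```yaml
# Abridged sketch: the real generated environment file contains many more packages.
dependencies:
  - cuda-version=11.8   # single source of truth for the CUDA release
  - cudatoolkit         # unpinned; intended to be constrained via cuda-version
  - cuda-nvtx=11.8
```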
References: * https://github.com/conda-forge/cuda-version-feedstock/blob/902045016fbc6e4dd1350a7390b0411f376d1a19/recipe/meta.yaml#L13-L18 * https://docs.conda.io/projects/conda-build/en/stable/resources/define-metadata.html#run-constrained Authors: - James Lamb (https://github.com/jameslamb) Approvers: - Ray Douglass (https://github.com/raydouglass) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/wholegraph/pull/115 --- .pre-commit-config.yaml | 2 +- .../all_cuda-118_arch-x86_64.yaml | 3 +- dependencies.yaml | 50 ++++++++++++++----- 3 files changed, 40 insertions(+), 15 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 6943ae3b0..eef7a0285 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -40,7 +40,7 @@ repos: pass_filenames: false additional_dependencies: [gitpython] - repo: https://github.com/rapidsai/dependency-file-generator - rev: v1.5.1 + rev: v1.8.0 hooks: - id: rapids-dependency-file-generator args: ["--clean"] diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index c7410ce71..825ec1f7d 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -13,7 +13,8 @@ dependencies: - clangxx==16.0.6 - cmake>=3.26.4 - cuda-nvtx=11.8 -- cudatoolkit=11.8 +- cuda-version=11.8 +- cudatoolkit - cudnn=8.8 - cxx-compiler - cython>=3.0.0 diff --git a/dependencies.yaml b/dependencies.yaml index 2a52b450f..17ed61598 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -8,7 +8,8 @@ files: includes: - checks - build - - cudatoolkit + - cuda + - cuda_version - py_version - run - test_python @@ -17,11 +18,11 @@ files: test_cpp: output: none includes: - - cudatoolkit + - cuda_version test_python: output: none includes: - - cudatoolkit + - cuda_version - py_version - test_python checks: @@ -32,7 +33,7 @@ files: docs: output: none includes: - - cudatoolkit + - cuda_version - docs - py_version - pytorch_cpu @@ -40,7 +41,8 @@ files: output: none includes: - build - - cudatoolkit + - cuda + - cuda_version - py_version - run - pytorch_cpu @@ -107,39 +109,61 @@ dependencies: cuda: "11.8" packages: - nvcc_linux-aarch64=11.8 + - matrix: + cuda: "12.*" + packages: + - cuda-nvcc + cuda_version: + specific: + - output_types: conda + matrices: + - matrix: + cuda: "11.2" + packages: + - cuda-version=11.2 + - matrix: + cuda: "11.4" + packages: + - cuda-version=11.4 + - matrix: + cuda: "11.5" + packages: + - cuda-version=11.5 + - matrix: + cuda: "11.8" + packages: + - cuda-version=11.8 - matrix: cuda: "12.0" packages: - cuda-version=12.0 - - cuda-nvcc - cudatoolkit: + cuda: specific: - output_types: conda matrices: - matrix: cuda: "11.2" packages: - - cudatoolkit=11.2 + - cudatoolkit - cuda-nvtx=11.4 # oldest available - matrix: cuda: "11.4" packages: - - cudatoolkit=11.4 + - cudatoolkit - cuda-nvtx=11.4 # oldest available - matrix: cuda: "11.5" packages: - - cudatoolkit=11.5 + - cudatoolkit - cuda-nvtx=11.5 - matrix: cuda: "11.8" packages: - - cudatoolkit=11.8 + - cudatoolkit - cuda-nvtx=11.8 - matrix: - cuda: "12.0" + cuda: "12.*" packages: - - cuda-version=12.0 - cuda-cudart-dev - cuda-nvtx checks: From 0ddab62f9e3c44a51cc9755ce3a7b10caef6fb40 Mon Sep 17 00:00:00 2001 From: Kyle Edwards Date: Fri, 12 Jan 2024 11:55:40 -0500 Subject: [PATCH 5/7] Remove usages of rapids-env-update (#117) Reference: https://github.com/rapidsai/ops/issues/2766 Replace rapids-env-update with rapids-configure-conda-channels, 
rapids-configure-sccache, and rapids-date-string. Authors: - Kyle Edwards (https://github.com/KyleFromNVIDIA) Approvers: - AJ Schmidt (https://github.com/ajschmidt8) URL: https://github.com/rapidsai/wholegraph/pull/117 --- ci/build_cpp.sh | 6 +++++- ci/build_python.sh | 6 +++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/ci/build_cpp.sh b/ci/build_cpp.sh index 4e1b7bd2a..bd45a3f57 100755 --- a/ci/build_cpp.sh +++ b/ci/build_cpp.sh @@ -3,7 +3,11 @@ set -euo pipefail -source rapids-env-update +rapids-configure-conda-channels + +source rapids-configure-sccache + +source rapids-date-string export CMAKE_GENERATOR=Ninja diff --git a/ci/build_python.sh b/ci/build_python.sh index b79ba92b1..efb7bfe4a 100755 --- a/ci/build_python.sh +++ b/ci/build_python.sh @@ -3,7 +3,11 @@ set -euo pipefail -source rapids-env-update +rapids-configure-conda-channels + +source rapids-configure-sccache + +source rapids-date-string export CMAKE_GENERATOR=Ninja From 4a92d47c9580ede6f475f8ee382342013d08f092 Mon Sep 17 00:00:00 2001 From: Chuang Zhu <111838961+chuangz0@users.noreply.github.com> Date: Fri, 19 Jan 2024 23:37:03 +0800 Subject: [PATCH 6/7] fix inferencesample option (#107) fix inferencesample option Authors: - Chuang Zhu (https://github.com/chuangz0) Approvers: - Brad Rees (https://github.com/BradReesWork) URL: https://github.com/rapidsai/wholegraph/pull/107 --- cpp/src/wholememory_ops/functions/nvshmem_device_reference.cuh | 2 +- cpp/src/wholememory_ops/gather_op_impl_nvshmem.cu | 1 + python/pylibwholegraph/pylibwholegraph/torch/common_options.py | 2 +- 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/cpp/src/wholememory_ops/functions/nvshmem_device_reference.cuh b/cpp/src/wholememory_ops/functions/nvshmem_device_reference.cuh index d2d040a0e..5fa93ee12 100644 --- a/cpp/src/wholememory_ops/functions/nvshmem_device_reference.cuh +++ b/cpp/src/wholememory_ops/functions/nvshmem_device_reference.cuh @@ -29,7 +29,7 @@ class nvshmem_device_reference { : pointer_(static_cast(nvshmem_ref.pointer)), typed_stride_(nvshmem_ref.stride / sizeof(DataTypeT)) { - assert(gref.stride % sizeof(DataTypeT) == 0); + assert(nvshmem_ref.stride % sizeof(DataTypeT) == 0); } __device__ nvshmem_device_reference() = delete; diff --git a/cpp/src/wholememory_ops/gather_op_impl_nvshmem.cu b/cpp/src/wholememory_ops/gather_op_impl_nvshmem.cu index a860cbc6c..4051f12bd 100644 --- a/cpp/src/wholememory_ops/gather_op_impl_nvshmem.cu +++ b/cpp/src/wholememory_ops/gather_op_impl_nvshmem.cu @@ -185,6 +185,7 @@ wholememory_error_code_t wholememory_gather_nvshmem( p_env_fns, stream); // ungistre + WM_CUDA_CHECK(cudaStreamSynchronize(stream)); if (nvshmemx_buffer_unregister(temp_output_ptr) != 0) { WHOLEMEMORY_ERROR("nvshmemx_buffer_unregister error in wholememory_gather_nvshmem"); } diff --git a/python/pylibwholegraph/pylibwholegraph/torch/common_options.py b/python/pylibwholegraph/pylibwholegraph/torch/common_options.py index 0999fdfe5..42746add8 100644 --- a/python/pylibwholegraph/pylibwholegraph/torch/common_options.py +++ b/python/pylibwholegraph/pylibwholegraph/torch/common_options.py @@ -132,7 +132,7 @@ def add_common_sampler_options(argparser: ArgumentParser): argparser.add_argument( "-s", "--inferencesample", - type=int, + type=str, dest="inferencesample", default="30", help="inference sample count, -1 is all", From ec609abe85760bc523c42e0e4d64ac4616181363 Mon Sep 17 00:00:00 2001 From: linhu-nv <141609318+linhu-nv@users.noreply.github.com> Date: Fri, 19 Jan 2024 23:42:29 +0800 Subject: [PATCH 7/7] fix a bug 
for embedding optimizer, which leads to undefined behavior (#108) Fix a bug for embedding optimizer, it leads to undefined behavior when embedding_dim is not multiple of 32. Authors: - https://github.com/linhu-nv Approvers: - Brad Rees (https://github.com/BradReesWork) URL: https://github.com/rapidsai/wholegraph/pull/108 --- .../functions/embedding_optimizer_func.cu | 12 ++++++++---- .../wholememory_embedding_gradient_apply_tests.cu | 8 +++++--- 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/cpp/src/wholememory_ops/functions/embedding_optimizer_func.cu b/cpp/src/wholememory_ops/functions/embedding_optimizer_func.cu index e6d751280..0249ba1ac 100644 --- a/cpp/src/wholememory_ops/functions/embedding_optimizer_func.cu +++ b/cpp/src/wholememory_ops/functions/embedding_optimizer_func.cu @@ -214,7 +214,8 @@ __global__ void sgd_optimizer_step_kernel(const IndiceT* indices_ptr, int local_dim_idx = threadIdx.x; float grad_value = 0.0f; int embedding_idx = local_dim_idx + loop_start_idx; - if (embedding_idx < embedding_dim) { grad_value = grads_ptr[embedding_idx]; } + if (embedding_idx >= embedding_dim) { break; } + grad_value = grads_ptr[embedding_idx]; float embedding_value = embedding_ptr[embedding_idx]; grad_value += weight_decay * embedding_value; embedding_value -= lr * grad_value; @@ -392,7 +393,8 @@ __global__ void lazy_adam_optimizer_step_kernel(const IndiceT* indices_ptr, int local_dim_idx = threadIdx.x; float grad_value = 0.0f; int embedding_idx = local_dim_idx + loop_start_idx; - if (embedding_idx < embedding_dim) { grad_value = grads_ptr[local_dim_idx + loop_start_idx]; } + if (embedding_idx >= embedding_dim) { break; } + grad_value = grads_ptr[local_dim_idx + loop_start_idx]; float embedding_value = embedding_ptr[embedding_idx]; if (AdamW) { embedding_value -= lr * weight_decay * embedding_value; @@ -644,7 +646,8 @@ __global__ void ada_grad_optimizer_step_kernel(const IndiceT* indices_ptr, int local_dim_idx = threadIdx.x; float grad_value = 0.0f; int embedding_idx = local_dim_idx + loop_start_idx; - if (embedding_idx < embedding_dim) { grad_value = grads_ptr[embedding_idx]; } + if (embedding_idx >= embedding_dim) { break; } + grad_value = grads_ptr[embedding_idx]; float embedding_value = embedding_ptr[embedding_idx]; grad_value = grad_value + weight_decay * embedding_value; float state_sum = state_sum_ptr[embedding_idx]; @@ -841,7 +844,8 @@ __global__ void rms_prop_optimizer_step_kernel(const IndiceT* indices_ptr, int local_dim_idx = threadIdx.x; float grad_value = 0.0f; int embedding_idx = local_dim_idx + loop_start_idx; - if (embedding_idx < embedding_dim) { grad_value = grads_ptr[local_dim_idx + loop_start_idx]; } + if (embedding_idx >= embedding_dim) { break; } + grad_value = grads_ptr[local_dim_idx + loop_start_idx]; float embedding_value = embedding_ptr[embedding_idx]; grad_value = grad_value + weight_decay * embedding_value; float v = v_ptr[embedding_idx]; diff --git a/cpp/tests/wholememory_ops/wholememory_embedding_gradient_apply_tests.cu b/cpp/tests/wholememory_ops/wholememory_embedding_gradient_apply_tests.cu index 453b13b41..bb6360fc0 100644 --- a/cpp/tests/wholememory_ops/wholememory_embedding_gradient_apply_tests.cu +++ b/cpp/tests/wholememory_ops/wholememory_embedding_gradient_apply_tests.cu @@ -149,7 +149,7 @@ struct EmbeddingBackwardTestParams { wholememory_optimizer_type_t optimizer_type = WHOLEMEMORY_OPT_SGD; float cache_ratio = 0.2; bool use_cache = false; - int run_count = 1; + int run_count = 3; float lr_ = 0.1; @@ -428,7 +428,7 @@ void 
prepare_data_and_reference( int64_t end_entry = (thread_rank + 1) * total_entry_count / thread_world_size; CPUOptimizer cpu_optimizer(¶ms, start_entry, end_entry); int embedding_dim = params.grad_description.sizes[1]; - for (int step = 0; step <= params.run_count; step++) { + for (int step = 0; step < params.run_count; step++) { int step_id = std::min(step, params.run_count - 1); std::vector indices; std::vector> grads; @@ -625,7 +625,7 @@ TEST_P(WholeMemoryEmbeddingBackwardParameterTests, EmbeddingGatherGradientApplyT EXPECT_EQ(cudaStreamSynchronize(nullptr), cudaSuccess); EXPECT_EQ(wholememory_communicator_barrier(wm_comm), WHOLEMEMORY_SUCCESS); - for (int run = 0; run <= params.run_count; run++) { + for (int run = 0; run < params.run_count; run++) { int step_id = std::min(run, params.run_count - 1); auto& rank_indices_vec = step_rank_indices[step_id][world_rank]; auto& rank_grads_vec = step_rank_grads[step_id][world_rank]; @@ -737,6 +737,8 @@ INSTANTIATE_TEST_SUITE_P( EmbeddingBackwardTestParams().set_use_cache().set_indice_count(10000127).set_optimizer_type(WHOLEMEMORY_OPT_ADAGRAD), EmbeddingBackwardTestParams().set_use_cache().set_indice_count(10000127).set_optimizer_type(WHOLEMEMORY_OPT_LAZY_ADAM), #endif + EmbeddingBackwardTestParams().set_entry_count(500).set_indice_count(400).set_embedding_dim(4), + EmbeddingBackwardTestParams().set_embedding_dim(3), EmbeddingBackwardTestParams().set_use_cache().set_grad_stride(131), EmbeddingBackwardTestParams().set_use_cache().set_grad_stride(131).set_optimizer_type( WHOLEMEMORY_OPT_RMSPROP),