From c43f6d173cdc15131176ed35c7fd93fbc5e14c06 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Mon, 18 Dec 2023 12:45:53 -0600 Subject: [PATCH 1/7] Align versions for cudnn, clang-tools, cython, and doxygen with the rest of RAPIDS. (#112) This PR aligns versions for wholegraph dependencies with versions used by other RAPIDS packages. This is needed for devcontainers, to make the unified RAPIDS conda environment solvable. See https://github.com/rapidsai/devcontainers/pull/191. Authors: - Bradley Dice (https://github.com/bdice) Approvers: - AJ Schmidt (https://github.com/ajschmidt8) - Ray Douglass (https://github.com/raydouglass) URL: https://github.com/rapidsai/wholegraph/pull/112 --- conda/environments/all_cuda-118_arch-x86_64.yaml | 10 +++++----- conda/environments/all_cuda-120_arch-x86_64.yaml | 10 +++++----- dependencies.yaml | 14 +++++++------- python/pylibwholegraph/pyproject.toml | 2 +- 4 files changed, 18 insertions(+), 18 deletions(-) diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 292c809d4..c7410ce71 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -9,15 +9,15 @@ channels: dependencies: - breathe - c-compiler -- clang-tools=16.0.0 -- clangxx=16.0.0 +- clang-tools==16.0.6 +- clangxx==16.0.6 - cmake>=3.26.4 - cuda-nvtx=11.8 - cudatoolkit=11.8 -- cudnn=8.4 +- cudnn=8.8 - cxx-compiler -- cython -- doxygen=1.8.20 +- cython>=3.0.0 +- doxygen==1.9.1 - gcc_linux-64=11.* - gitpython - graphviz diff --git a/conda/environments/all_cuda-120_arch-x86_64.yaml b/conda/environments/all_cuda-120_arch-x86_64.yaml index b436d5641..568c312b4 100644 --- a/conda/environments/all_cuda-120_arch-x86_64.yaml +++ b/conda/environments/all_cuda-120_arch-x86_64.yaml @@ -9,17 +9,17 @@ channels: dependencies: - breathe - c-compiler -- clang-tools=16.0.0 -- clangxx=16.0.0 +- clang-tools==16.0.6 +- clangxx==16.0.6 - cmake>=3.26.4 - cuda-cudart-dev - cuda-nvcc - cuda-nvtx - cuda-version=12.0 -- cudnn=8.4 +- cudnn=8.8 - cxx-compiler -- cython -- doxygen=1.8.20 +- cython>=3.0.0 +- doxygen==1.9.1 - gcc_linux-64=11.* - gitpython - graphviz diff --git a/dependencies.yaml b/dependencies.yaml index 71c0dc93d..2a52b450f 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -68,10 +68,10 @@ dependencies: packages: - c-compiler - cmake>=3.26.4 - - cudnn=8.4 + - cudnn=8.8 - cxx-compiler - - cython - - doxygen=1.8.20 + - cython>=3.0.0 + - &doxygen doxygen==1.9.1 - libraft-headers==24.2.* - librmm==24.2.* - nanobind>=0.2.0 @@ -252,7 +252,7 @@ dependencies: - output_types: [conda, requirements] packages: - breathe - - doxygen=1.8.20 + - *doxygen - graphviz - ipython - ipykernel @@ -274,15 +274,15 @@ dependencies: common: - output_types: [conda, requirements] packages: - - clangxx=16.0.0 - - clang-tools=16.0.0 + - clangxx==16.0.6 + - clang-tools==16.0.6 - gitpython python_build_wheel: common: - output_types: [pyproject] packages: - cmake>=3.26.4 - - cython>=0.29,<0.30 + - cython>=3.0.0 - ninja - setuptools - scikit-build>=0.13.1 diff --git a/python/pylibwholegraph/pyproject.toml b/python/pylibwholegraph/pyproject.toml index ccb14b831..910d4ccea 100644 --- a/python/pylibwholegraph/pyproject.toml +++ b/python/pylibwholegraph/pyproject.toml @@ -15,7 +15,7 @@ [build-system] requires = [ "cmake>=3.26.4", - "cython>=0.29,<0.30", + "cython>=3.0.0", "ninja", "scikit-build>=0.13.1", "setuptools", From ef5b3eef3f89dc2c9536fe9abce2d0c93cc9a364 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani 
Date: Tue, 19 Dec 2023 09:28:02 -0800 Subject: [PATCH 2/7] Don't overwrite wholegraph_ROOT if provided (#114) This change allows standard CMake specification of the C++ package directory (via `-Dwholegraph_ROOT`) to also work during the Python build. Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Bradley Dice (https://github.com/bdice) - Ray Douglass (https://github.com/raydouglass) URL: https://github.com/rapidsai/wholegraph/pull/114 --- python/pylibwholegraph/CMakeLists.txt | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/python/pylibwholegraph/CMakeLists.txt b/python/pylibwholegraph/CMakeLists.txt index a8cd320e0..758fe3713 100644 --- a/python/pylibwholegraph/CMakeLists.txt +++ b/python/pylibwholegraph/CMakeLists.txt @@ -113,7 +113,9 @@ include(../../cpp/cmake/thirdparty/get_raft.cmake) #include(${CMAKE_CURRENT_LIST_DIR}/../cmake/thirdparty/nanobind.cmake) # use _ROOT here to take precedence over any other package -set(wholegraph_ROOT "$ENV{LIBWHOLEGRAPH_DIR}") +if (DEFINED ENV{LIBWHOLEGRAPH_DIR}) + set(wholegraph_ROOT "$ENV{LIBWHOLEGRAPH_DIR}") +endif() find_package(wholegraph "${RAPIDS_VERSION}.0" EXACT) message("WholeGraph") if (WHOLEGRAPH_FOUND) From aaceac529dc3a87286fd1cb2a435f42d3404d735 Mon Sep 17 00:00:00 2001 From: dongxuy04 <78518666+dongxuy04@users.noreply.github.com> Date: Tue, 9 Jan 2024 23:16:01 +0800 Subject: [PATCH 3/7] added Direct IO support for WholeMemory loading (#113) Add Direct IO support option for WholeMemory loading from disk. Using Direct IO may be faster on some high performance file systems. Authors: - https://github.com/dongxuy04 - Brad Rees (https://github.com/BradReesWork) Approvers: - Brad Rees (https://github.com/BradReesWork) URL: https://github.com/rapidsai/wholegraph/pull/113 --- cpp/src/wholememory/file_io.cpp | 469 +++++++++++++++++++++++++------- 1 file changed, 372 insertions(+), 97 deletions(-) diff --git a/cpp/src/wholememory/file_io.cpp b/cpp/src/wholememory/file_io.cpp index 3274811e1..0540a3f5d 100644 --- a/cpp/src/wholememory/file_io.cpp +++ b/cpp/src/wholememory/file_io.cpp @@ -15,14 +15,17 @@ */ #include "file_io.h" +#include #include #include +#include #include #include #include "communicator.hpp" #include "error.hpp" +#include "integer_utils.hpp" #include "logger.hpp" namespace wholememory { @@ -38,6 +41,15 @@ static size_t StatFileSize(const char* filename) return filesize; } +static size_t StatFileBlockSize(const char* filename) +{ + auto blocksize = static_cast(-1); + struct stat statbuf {}; + if (stat(filename, &statbuf) < 0) { return blocksize; } + blocksize = statbuf.st_blksize; + return blocksize; +} + static size_t get_handle_partial_size(size_t handle_size, size_t memory_offset, size_t memory_entry_stride, @@ -62,6 +74,317 @@ static size_t get_handle_partial_size(size_t handle_size, return partial_size; } +/*! + * Read from file list to local memory of WholeMemory. File list are binary files, which are + * considered to be concatenated together. All ranks in WholeMemory will read the files in parallel + * and load each part into local memory of each rank. + * @param local_ptr : Pointer to local memory of WholeMemory + * @param local_size : Local memory size + * @param local_offset : The offset of local memory in WholeMemory. + * @param entry_size : The entry size of each data entry. + * @param memory_entry_stride : The stride of each entry in WholeMemory + * @param memory_offset : The start offset to place the read data. 
Should be in range [0, + * memory_entry_stride) + * @param file_count : Total file count of the file list + * @param file_names : File names of the file list. + * @param file_sizes : Sizes of each file. + * @param suggested_buffer_size : Suggested buffer size to read. + * @param wm_rank : WholeMemory rank. + */ +static void read_file_list_to_local_memory(char* local_ptr, + size_t local_size, + size_t local_offset, + size_t entry_size, + size_t memory_entry_stride, + size_t memory_offset, + int file_count, + const char** file_names, + const std::vector& file_sizes, + size_t suggested_buffer_size, + int wm_rank) +{ + size_t buffer_size; + size_t buffer_entry_count = 1; + if (suggested_buffer_size < entry_size) { + buffer_size = entry_size; + } else { + buffer_entry_count = suggested_buffer_size / entry_size; + buffer_size = buffer_entry_count * entry_size; + } + std::vector file_read_buffer(buffer_size); + + size_t local_entry_memory_start_index = local_offset / memory_entry_stride; + size_t local_entry_file_start_index = + local_entry_memory_start_index - memory_offset / memory_entry_stride; + size_t local_entry_count = local_size / memory_entry_stride; + char* local_write_ptr = local_ptr + memory_offset % memory_entry_stride; + if (wm_rank == 0) { + local_entry_count -= memory_offset / memory_entry_stride; + local_write_ptr += (memory_offset / memory_entry_stride) * memory_entry_stride; + } + size_t local_entry_idx = 0; + + size_t file_entry_offset = 0; + size_t total_read_bytes = 0; + for (int i = 0; i < file_count; i++) { + size_t file_entry_count = file_sizes[i] / entry_size; + // already outside reading window + if (file_entry_offset >= local_entry_file_start_index + local_entry_count) break; + // in reading window + if (file_entry_offset + file_entry_count > local_entry_file_start_index) { + size_t file_read_start_offset = 0; + FILE* fp = fopen(file_names[i], "rb"); + if (fp == nullptr) { WHOLEMEMORY_ERROR("Open file %s for read failed.", file_names[i]); } + // maybe in window end, remove possible tailing data that don't belong to current rank. + size_t to_read_file_entry_count = std::min( + file_entry_count, local_entry_file_start_index + local_entry_count - file_entry_offset); + // if in window begin, remove possible data that belongs to previous rank and skip disk + // data. + if (file_entry_offset < local_entry_file_start_index) { + size_t skip_entry_count = local_entry_file_start_index - file_entry_offset; + + file_read_start_offset = skip_entry_count * entry_size; + + if (fseeko(fp, file_read_start_offset, SEEK_SET) != 0) { + WHOLEMEMORY_ERROR( + "File %s seek to %ld failed.", file_names[i], skip_entry_count * entry_size); + } + to_read_file_entry_count -= skip_entry_count; + } + // now all data in file_entry_count need to be read. 
+ size_t bytes_to_read = to_read_file_entry_count * entry_size; + size_t left_entry_count = to_read_file_entry_count; + while (left_entry_count > 0) { + size_t read_entry_count = std::min(left_entry_count, buffer_entry_count); + + int ret = fread(file_read_buffer.data(), entry_size, read_entry_count, fp); + if (ret != read_entry_count) { + WHOLEMEMORY_ERROR( + "File %s line %d: reading from file %s, read_entry_count=%ld, entry_size=%ld, " + "returned %d, error=%s\n", + __FILE__, + __LINE__, + file_names[i], + read_entry_count, + entry_size, + ret, + strerror(errno)); + } + + if (entry_size != memory_entry_stride) { + WM_CUDA_CHECK(cudaMemcpy2D(local_write_ptr, + memory_entry_stride, + file_read_buffer.data(), + entry_size, + entry_size, + read_entry_count, + cudaMemcpyDefault)); + } else { + WM_CUDA_CHECK(cudaMemcpy(local_write_ptr, + file_read_buffer.data(), + read_entry_count * entry_size, + cudaMemcpyDefault)); + } + local_write_ptr += read_entry_count * memory_entry_stride; + + left_entry_count -= read_entry_count; + } + fclose(fp); + WHOLEMEMORY_INFO( + "Rank=%d done Reading %ld bytes from file %s size=%ld, starting from offset=%ld.", + wm_rank, + bytes_to_read, + file_names[i], + file_sizes[i], + file_read_start_offset); + total_read_bytes += bytes_to_read; + } + file_entry_offset += file_entry_count; + } + WHOLEMEMORY_INFO( + "Rank=%d done reading total %ld bytes from needed files.", wm_rank, total_read_bytes); +} + +/*! + * Read from file list to local memory of WholeMemory using DirectIO. Using DirectIO may have better + * performance by bypassing system cache if it is bottleneck. File list are binary files, which are + * considered to be concatenated together. All ranks in WholeMemory will read the files in parallel + * and load each part into local memory of each rank. + * @param local_ptr : Pointer to local memory of WholeMemory + * @param local_size : Local memory size + * @param local_offset : The offset of local memory in WholeMemory. + * @param entry_size : The entry size of each data entry. + * @param memory_entry_stride : The stride of each entry in WholeMemory + * @param memory_offset : The start offset to place the read data. Should be in range [0, + * memory_entry_stride) + * @param file_count : Total file count of the file list + * @param file_names : File names of the file list. + * @param file_sizes : Sizes of each file. + * @param suggested_buffer_size : Suggested buffer size to read. + * @param wm_rank : WholeMemory rank. 
+ */ +static void read_file_list_to_local_memory_directio(char* local_ptr, + size_t local_size, + size_t local_offset, + size_t entry_size, + size_t memory_entry_stride, + size_t memory_offset, + int file_count, + const char** file_names, + const std::vector& file_sizes, + size_t suggested_buffer_size, + int wm_rank) +{ + if (memory_offset + entry_size > memory_entry_stride) { + WHOLEMEMORY_FAIL_NOTHROW("Direct io mode only support reading all entries."); + } + size_t local_entry_start_index = local_offset / memory_entry_stride; + size_t local_entry_count = local_size / memory_entry_stride; + char* local_write_ptr = local_ptr + memory_offset % memory_entry_stride; + + static size_t kAlignSize = 16 * 1024 * 1024; + suggested_buffer_size = round_up_unsafe(suggested_buffer_size, kAlignSize); + + char* block_buffer; + WHOLEMEMORY_CHECK_NOTHROW(posix_memalign(reinterpret_cast(&block_buffer), + kAlignSize, + suggested_buffer_size) == 0); + + size_t file_entry_offset = 0; + size_t read_entry_count = 0; + for (int i = 0; i < file_count; i++) { + size_t file_entry_count = file_sizes[i] / entry_size; + // already outside reading window + if (file_entry_offset >= local_entry_start_index + local_entry_count) break; + // reading window not reached + if (file_entry_offset + file_entry_count <= local_entry_start_index) { + file_entry_offset += file_entry_count; + continue; + } + // in reading window + auto block_size = StatFileBlockSize(file_names[i]); + if (block_size == 0 || block_size == (size_t)-1 || kAlignSize % block_size != 0) { + WHOLEMEMORY_FAIL_NOTHROW( + "block_size=%ld for file %s, but alignment is %ld", block_size, file_names[i], kAlignSize); + } + size_t buffer_block_count = suggested_buffer_size / block_size; + int fd = open(file_names[i], O_DIRECT | O_RDONLY); + if (fd < 0) { WHOLEMEMORY_FAIL_NOTHROW("Open file %s with direct io failed.", file_names[i]); } + + // maybe in window end, remove possible tailing data that don't belong to current rank. + size_t to_read_file_entry_count = + std::min(file_entry_count, local_entry_start_index + local_entry_count - file_entry_offset); + + size_t file_read_end = to_read_file_entry_count * entry_size; + // if in window begin, remove possible data that belongs to previous rank and skip disk + // data. 
+ size_t file_read_start = 0; + if (file_entry_offset < local_entry_start_index) { + size_t skip_entry_count = local_entry_start_index - file_entry_offset; + to_read_file_entry_count -= skip_entry_count; + file_read_start = skip_entry_count * entry_size; + } + + size_t file_block_read_offset = file_read_start / block_size * block_size; + size_t skip_head_size = file_read_start - file_block_read_offset; + + char* local_mem_write_entry_for_file = local_write_ptr + read_entry_count * memory_entry_stride; + size_t first_mem_entry_offset = 0; + size_t useful_data_bytes_read = 0; + size_t physical_data_bytes_read = 0; + while (file_block_read_offset < file_read_end) { + size_t left_size = file_read_end - file_block_read_offset; + size_t left_block_count = div_rounding_up_unsafe(left_size, block_size); + size_t read_block_count = std::min(left_block_count, buffer_block_count); + size_t physical_read_size = read_block_count * block_size; + physical_data_bytes_read += physical_read_size; + + ssize_t pread_size = pread64(fd, block_buffer, physical_read_size, file_block_read_offset); + if (pread_size != physical_read_size && + file_block_read_offset + pread_size != file_sizes[i]) { + WHOLEMEMORY_FAIL_NOTHROW( + "rank=%d, pread_size=%ld, physical_read_size=%ld, file_block_read_offset=%ld, " + "file_sizes[i]=%ld, file=%s", + wm_rank, + pread_size, + physical_read_size, + file_block_read_offset, + file_sizes[i], + file_names[i]); + } + + size_t drop_tail_size = 0; + if (file_block_read_offset + physical_read_size > file_read_end) { + drop_tail_size = file_block_read_offset + physical_read_size - file_read_end; + } + + char* useful_data_ptr = block_buffer + skip_head_size; + size_t useful_data_size = physical_read_size - skip_head_size - drop_tail_size; + + useful_data_bytes_read += useful_data_size; + + if (first_mem_entry_offset != 0) { + // process head + size_t entry_left_size = entry_size - first_mem_entry_offset; + WM_CUDA_CHECK_NO_THROW(cudaMemcpy(local_mem_write_entry_for_file + first_mem_entry_offset, + useful_data_ptr, + entry_left_size, + cudaMemcpyDefault)); + local_mem_write_entry_for_file += memory_entry_stride; + useful_data_ptr += entry_left_size; + useful_data_size -= entry_left_size; + entry_left_size = 0; + } + + size_t full_entry_count = useful_data_size / entry_size; + size_t full_entry_size = full_entry_count * entry_size; + + if (full_entry_size > 0) { + if (entry_size != memory_entry_stride) { + WM_CUDA_CHECK(cudaMemcpy2D(local_mem_write_entry_for_file, + memory_entry_stride, + useful_data_ptr, + entry_size, + entry_size, + full_entry_count, + cudaMemcpyDefault)); + } else { + WM_CUDA_CHECK(cudaMemcpy( + local_mem_write_entry_for_file, useful_data_ptr, full_entry_size, cudaMemcpyDefault)); + } + local_mem_write_entry_for_file += memory_entry_stride * full_entry_count; + useful_data_ptr += full_entry_size; + useful_data_size -= full_entry_size; + } + + size_t tail_entry_size = useful_data_size % entry_size; + if (tail_entry_size != 0) { + // process tail + WM_CUDA_CHECK_NO_THROW(cudaMemcpy( + local_mem_write_entry_for_file, useful_data_ptr, tail_entry_size, cudaMemcpyDefault)); + first_mem_entry_offset = tail_entry_size; + } + + file_block_read_offset += physical_read_size; + skip_head_size = 0; + } + + WHOLEMEMORY_INFO( + "Rank=%d done Reading %ld useful bytes by reading %ld block bytes using DirectIO from file " + "%s size=%ld.", + wm_rank, + useful_data_bytes_read, + physical_data_bytes_read, + file_names[i], + file_sizes[i]); + + close(fd); + file_entry_offset += 
file_entry_count; + read_entry_count += to_read_file_entry_count; + } + free(block_buffer); +} + wholememory_error_code_t load_file_to_handle(wholememory_handle_t wholememory_handle, size_t memory_offset, size_t memory_entry_stride, @@ -153,107 +476,59 @@ wholememory_error_code_t load_file_to_handle(wholememory_handle_t wholememory_ha (void**)(&local_ptr), &local_size, &local_offset, wholememory_handle) == WHOLEMEMORY_SUCCESS); - constexpr int kSuggestedBufferSize = 16 * 1024 * 1024; - size_t buffer_size; - size_t buffer_entry_count = 1; - if (kSuggestedBufferSize < entry_size) { - buffer_size = entry_size; - } else { - buffer_entry_count = kSuggestedBufferSize / entry_size; - buffer_size = buffer_entry_count * entry_size; + int suggested_buffer_size_mb = 16; + const char* buffer_size_env_var = std::getenv("WG_LOAD_BUFFER_SIZE_MB"); + if (buffer_size_env_var != nullptr) { + try { + suggested_buffer_size_mb = std::stoi(buffer_size_env_var); + } catch (const std::invalid_argument& e) { + suggested_buffer_size_mb = 16; + WHOLEMEMORY_WARN( + "Environment variable WG_LOAD_BUFFER_SIZE_MB value %s is not valid, using default %d", + buffer_size_env_var, + suggested_buffer_size_mb); + } + if (suggested_buffer_size_mb < 1) { + suggested_buffer_size_mb = 16; + WHOLEMEMORY_WARN( + "Environment variable WG_LOAD_BUFFER_SIZE_MB value %s is not valid, using default %d", + buffer_size_env_var, + suggested_buffer_size_mb); + } } - std::vector file_read_buffer(buffer_size); + size_t suggested_buffer_size = static_cast(suggested_buffer_size_mb) * 1024 * 1024; - size_t local_entry_memory_start_index = local_offset / memory_entry_stride; - size_t local_entry_file_start_index = - local_entry_memory_start_index - memory_offset / memory_entry_stride; - size_t local_entry_count = local_size / memory_entry_stride; - char* local_write_ptr = local_ptr + memory_offset % memory_entry_stride; - if (wm_rank == 0) { - local_entry_count -= memory_offset / memory_entry_stride; - local_write_ptr += (memory_offset / memory_entry_stride) * memory_entry_stride; + const char* directio_env_var = std::getenv("WG_LOAD_USE_DIRECTIO"); + bool use_direct_io = false; + if (directio_env_var != nullptr && directio_env_var[0] == '1' && directio_env_var[1] == '\0') { + use_direct_io = true; } - size_t local_entry_idx = 0; - - size_t file_entry_offset = 0; - size_t total_read_bytes = 0; - for (int i = 0; i < file_count; i++) { - size_t file_entry_count = file_sizes[i] / entry_size; - // already outside reading window - if (file_entry_offset >= local_entry_file_start_index + local_entry_count) break; - // in reading window - if (file_entry_offset + file_entry_count > local_entry_file_start_index) { - size_t file_read_start_offset = 0; - FILE* fp = fopen(file_names[i], "rb"); - if (fp == nullptr) { WHOLEMEMORY_ERROR("Open file %s for read failed.", file_names[i]); } - // maybe in window end, remove possible tailing data that don't belong to current rank. - size_t to_read_file_entry_count = std::min( - file_entry_count, local_entry_file_start_index + local_entry_count - file_entry_offset); - // if in window begin, remove possible data that belongs to previous rank and skip disk - // data. 
-        if (file_entry_offset < local_entry_file_start_index) {
-          size_t skip_entry_count = local_entry_file_start_index - file_entry_offset;
-
-          file_read_start_offset = skip_entry_count * entry_size;
-
-          if (fseeko(fp, file_read_start_offset, SEEK_SET) != 0) {
-            WHOLEMEMORY_ERROR(
-              "File %s seek to %ld failed.", file_names[i], skip_entry_count * entry_size);
-          }
-          to_read_file_entry_count -= skip_entry_count;
-        }
-        // now all data in file_entry_count need to be read.
-        size_t bytes_to_read    = to_read_file_entry_count * entry_size;
-        size_t left_entry_count = to_read_file_entry_count;
-        while (left_entry_count > 0) {
-          size_t read_entry_count = std::min(left_entry_count, buffer_entry_count);
-
-          int ret = fread(file_read_buffer.data(), entry_size, read_entry_count, fp);
-          if (ret != read_entry_count) {
-            WHOLEMEMORY_ERROR(
-              "File %s line %d: reading from file %s, read_entry_count=%ld, entry_size=%ld, "
-              "returned %d, error=%s\n",
-              __FILE__,
-              __LINE__,
-              file_names[i],
-              read_entry_count,
-              entry_size,
-              ret,
-              strerror(errno));
-          }
-
-          if (entry_size != memory_entry_stride) {
-            WM_CUDA_CHECK(cudaMemcpy2D(local_write_ptr,
-                                       memory_entry_stride,
-                                       file_read_buffer.data(),
-                                       entry_size,
-                                       entry_size,
-                                       read_entry_count,
-                                       cudaMemcpyDefault));
-          } else {
-            WM_CUDA_CHECK(cudaMemcpy(local_write_ptr,
-                                     file_read_buffer.data(),
-                                     read_entry_count * entry_size,
-                                     cudaMemcpyDefault));
-          }
-          local_write_ptr += read_entry_count * memory_entry_stride;
-
-          left_entry_count -= read_entry_count;
-        }
-        fclose(fp);
-        WHOLEMEMORY_INFO(
-          "Rank=%d done Reading %ld bytes from file %s size=%ld, starting from offset=%ld.",
-          wm_rank,
-          bytes_to_read,
-          file_names[i],
-          file_sizes[i],
-          file_read_start_offset);
-        total_read_bytes += bytes_to_read;
-      }
-      file_entry_offset += file_entry_count;
+    if (!use_direct_io) {
+      read_file_list_to_local_memory(local_ptr,
+                                     local_size,
+                                     local_offset,
+                                     entry_size,
+                                     memory_entry_stride,
+                                     memory_offset,
+                                     file_count,
+                                     file_names,
+                                     file_sizes,
+                                     suggested_buffer_size,
+                                     wm_rank);
+    } else {
+      read_file_list_to_local_memory_directio(local_ptr,
+                                              local_size,
+                                              local_offset,
+                                              entry_size,
+                                              memory_entry_stride,
+                                              memory_offset,
+                                              file_count,
+                                              file_names,
+                                              file_sizes,
+                                              suggested_buffer_size,
+                                              wm_rank);
     }
-    WHOLEMEMORY_INFO(
-      "Rank=%d done reading total %ld bytes from needed files.", wm_rank, total_read_bytes);
+    wm_comm->barrier();
   } catch (wholememory::logic_error& wle) {
     WHOLEMEMORY_ERROR("Logic error: %s", wle.what());

From 7025eafa75567f7353ae97b5f9077ef9f9879649 Mon Sep 17 00:00:00 2001
From: James Lamb
Date: Thu, 11 Jan 2024 13:54:01 -0600
Subject: [PATCH 4/7] refactor CUDA versions in dependencies.yaml (#115)

Contributes to https://github.com/rapidsai/build-planning/issues/7.

Proposes splitting the `cuda-version` dependency in `dependencies.yaml` out to
its own thing, separate from the bits of the CUDA Toolkit this project needs.

### Benefits of this change

* prevents accidental inclusion of multiple `cuda-version` versions in environments
* reduces update effort (via enabling more use of globs like `"12.*"`)
* improves the chance that errors like "`conda` recipe is missing a dependency" are caught in CI

### Notes for Reviewers

This change was intended to just re-organize `dependencies.yaml`, but I do think
the one additional change it introduces to `all_cuda-118_arch-x86_64.yaml` is a
good one. I *think* requiring the `cuda-version` metapackage in all environments
is useful to protect against environment solves that result in runtime issues.
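As a concrete illustration, here is a minimal, abridged sketch (not the full
generated file) of what `all_cuda-118_arch-x86_64.yaml` looks like after this
change: only the `cuda-version` metapackage carries the version pin, and the
toolkit packages are left unpinned, to be constrained through `cuda-version`
as described in the feedstock reference below.

```yaml
# Abridged sketch: the real generated environment file contains many more packages.
dependencies:
  - cuda-version=11.8   # single source of truth for the CUDA release
  - cudatoolkit         # unpinned; intended to be constrained via cuda-version
  - cuda-nvtx=11.8
```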
References: * https://github.com/conda-forge/cuda-version-feedstock/blob/902045016fbc6e4dd1350a7390b0411f376d1a19/recipe/meta.yaml#L13-L18 * https://docs.conda.io/projects/conda-build/en/stable/resources/define-metadata.html#run-constrained Authors: - James Lamb (https://github.com/jameslamb) Approvers: - Ray Douglass (https://github.com/raydouglass) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/wholegraph/pull/115 --- .pre-commit-config.yaml | 2 +- .../all_cuda-118_arch-x86_64.yaml | 3 +- dependencies.yaml | 50 ++++++++++++++----- 3 files changed, 40 insertions(+), 15 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 6943ae3b0..eef7a0285 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -40,7 +40,7 @@ repos: pass_filenames: false additional_dependencies: [gitpython] - repo: https://github.com/rapidsai/dependency-file-generator - rev: v1.5.1 + rev: v1.8.0 hooks: - id: rapids-dependency-file-generator args: ["--clean"] diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index c7410ce71..825ec1f7d 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -13,7 +13,8 @@ dependencies: - clangxx==16.0.6 - cmake>=3.26.4 - cuda-nvtx=11.8 -- cudatoolkit=11.8 +- cuda-version=11.8 +- cudatoolkit - cudnn=8.8 - cxx-compiler - cython>=3.0.0 diff --git a/dependencies.yaml b/dependencies.yaml index 2a52b450f..17ed61598 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -8,7 +8,8 @@ files: includes: - checks - build - - cudatoolkit + - cuda + - cuda_version - py_version - run - test_python @@ -17,11 +18,11 @@ files: test_cpp: output: none includes: - - cudatoolkit + - cuda_version test_python: output: none includes: - - cudatoolkit + - cuda_version - py_version - test_python checks: @@ -32,7 +33,7 @@ files: docs: output: none includes: - - cudatoolkit + - cuda_version - docs - py_version - pytorch_cpu @@ -40,7 +41,8 @@ files: output: none includes: - build - - cudatoolkit + - cuda + - cuda_version - py_version - run - pytorch_cpu @@ -107,39 +109,61 @@ dependencies: cuda: "11.8" packages: - nvcc_linux-aarch64=11.8 + - matrix: + cuda: "12.*" + packages: + - cuda-nvcc + cuda_version: + specific: + - output_types: conda + matrices: + - matrix: + cuda: "11.2" + packages: + - cuda-version=11.2 + - matrix: + cuda: "11.4" + packages: + - cuda-version=11.4 + - matrix: + cuda: "11.5" + packages: + - cuda-version=11.5 + - matrix: + cuda: "11.8" + packages: + - cuda-version=11.8 - matrix: cuda: "12.0" packages: - cuda-version=12.0 - - cuda-nvcc - cudatoolkit: + cuda: specific: - output_types: conda matrices: - matrix: cuda: "11.2" packages: - - cudatoolkit=11.2 + - cudatoolkit - cuda-nvtx=11.4 # oldest available - matrix: cuda: "11.4" packages: - - cudatoolkit=11.4 + - cudatoolkit - cuda-nvtx=11.4 # oldest available - matrix: cuda: "11.5" packages: - - cudatoolkit=11.5 + - cudatoolkit - cuda-nvtx=11.5 - matrix: cuda: "11.8" packages: - - cudatoolkit=11.8 + - cudatoolkit - cuda-nvtx=11.8 - matrix: - cuda: "12.0" + cuda: "12.*" packages: - - cuda-version=12.0 - cuda-cudart-dev - cuda-nvtx checks: From 0ddab62f9e3c44a51cc9755ce3a7b10caef6fb40 Mon Sep 17 00:00:00 2001 From: Kyle Edwards Date: Fri, 12 Jan 2024 11:55:40 -0500 Subject: [PATCH 5/7] Remove usages of rapids-env-update (#117) Reference: https://github.com/rapidsai/ops/issues/2766 Replace rapids-env-update with rapids-configure-conda-channels, 
rapids-configure-sccache, and rapids-date-string. Authors: - Kyle Edwards (https://github.com/KyleFromNVIDIA) Approvers: - AJ Schmidt (https://github.com/ajschmidt8) URL: https://github.com/rapidsai/wholegraph/pull/117 --- ci/build_cpp.sh | 6 +++++- ci/build_python.sh | 6 +++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/ci/build_cpp.sh b/ci/build_cpp.sh index 4e1b7bd2a..bd45a3f57 100755 --- a/ci/build_cpp.sh +++ b/ci/build_cpp.sh @@ -3,7 +3,11 @@ set -euo pipefail -source rapids-env-update +rapids-configure-conda-channels + +source rapids-configure-sccache + +source rapids-date-string export CMAKE_GENERATOR=Ninja diff --git a/ci/build_python.sh b/ci/build_python.sh index b79ba92b1..efb7bfe4a 100755 --- a/ci/build_python.sh +++ b/ci/build_python.sh @@ -3,7 +3,11 @@ set -euo pipefail -source rapids-env-update +rapids-configure-conda-channels + +source rapids-configure-sccache + +source rapids-date-string export CMAKE_GENERATOR=Ninja From 4a92d47c9580ede6f475f8ee382342013d08f092 Mon Sep 17 00:00:00 2001 From: Chuang Zhu <111838961+chuangz0@users.noreply.github.com> Date: Fri, 19 Jan 2024 23:37:03 +0800 Subject: [PATCH 6/7] fix inferencesample option (#107) fix inferencesample option Authors: - Chuang Zhu (https://github.com/chuangz0) Approvers: - Brad Rees (https://github.com/BradReesWork) URL: https://github.com/rapidsai/wholegraph/pull/107 --- cpp/src/wholememory_ops/functions/nvshmem_device_reference.cuh | 2 +- cpp/src/wholememory_ops/gather_op_impl_nvshmem.cu | 1 + python/pylibwholegraph/pylibwholegraph/torch/common_options.py | 2 +- 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/cpp/src/wholememory_ops/functions/nvshmem_device_reference.cuh b/cpp/src/wholememory_ops/functions/nvshmem_device_reference.cuh index d2d040a0e..5fa93ee12 100644 --- a/cpp/src/wholememory_ops/functions/nvshmem_device_reference.cuh +++ b/cpp/src/wholememory_ops/functions/nvshmem_device_reference.cuh @@ -29,7 +29,7 @@ class nvshmem_device_reference { : pointer_(static_cast(nvshmem_ref.pointer)), typed_stride_(nvshmem_ref.stride / sizeof(DataTypeT)) { - assert(gref.stride % sizeof(DataTypeT) == 0); + assert(nvshmem_ref.stride % sizeof(DataTypeT) == 0); } __device__ nvshmem_device_reference() = delete; diff --git a/cpp/src/wholememory_ops/gather_op_impl_nvshmem.cu b/cpp/src/wholememory_ops/gather_op_impl_nvshmem.cu index a860cbc6c..4051f12bd 100644 --- a/cpp/src/wholememory_ops/gather_op_impl_nvshmem.cu +++ b/cpp/src/wholememory_ops/gather_op_impl_nvshmem.cu @@ -185,6 +185,7 @@ wholememory_error_code_t wholememory_gather_nvshmem( p_env_fns, stream); // ungistre + WM_CUDA_CHECK(cudaStreamSynchronize(stream)); if (nvshmemx_buffer_unregister(temp_output_ptr) != 0) { WHOLEMEMORY_ERROR("nvshmemx_buffer_unregister error in wholememory_gather_nvshmem"); } diff --git a/python/pylibwholegraph/pylibwholegraph/torch/common_options.py b/python/pylibwholegraph/pylibwholegraph/torch/common_options.py index 0999fdfe5..42746add8 100644 --- a/python/pylibwholegraph/pylibwholegraph/torch/common_options.py +++ b/python/pylibwholegraph/pylibwholegraph/torch/common_options.py @@ -132,7 +132,7 @@ def add_common_sampler_options(argparser: ArgumentParser): argparser.add_argument( "-s", "--inferencesample", - type=int, + type=str, dest="inferencesample", default="30", help="inference sample count, -1 is all", From ec609abe85760bc523c42e0e4d64ac4616181363 Mon Sep 17 00:00:00 2001 From: linhu-nv <141609318+linhu-nv@users.noreply.github.com> Date: Fri, 19 Jan 2024 23:42:29 +0800 Subject: [PATCH 7/7] fix a bug 
for embedding optimizer, which leads to undefined behavior (#108) Fix a bug for embedding optimizer, it leads to undefined behavior when embedding_dim is not multiple of 32. Authors: - https://github.com/linhu-nv Approvers: - Brad Rees (https://github.com/BradReesWork) URL: https://github.com/rapidsai/wholegraph/pull/108 --- .../functions/embedding_optimizer_func.cu | 12 ++++++++---- .../wholememory_embedding_gradient_apply_tests.cu | 8 +++++--- 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/cpp/src/wholememory_ops/functions/embedding_optimizer_func.cu b/cpp/src/wholememory_ops/functions/embedding_optimizer_func.cu index e6d751280..0249ba1ac 100644 --- a/cpp/src/wholememory_ops/functions/embedding_optimizer_func.cu +++ b/cpp/src/wholememory_ops/functions/embedding_optimizer_func.cu @@ -214,7 +214,8 @@ __global__ void sgd_optimizer_step_kernel(const IndiceT* indices_ptr, int local_dim_idx = threadIdx.x; float grad_value = 0.0f; int embedding_idx = local_dim_idx + loop_start_idx; - if (embedding_idx < embedding_dim) { grad_value = grads_ptr[embedding_idx]; } + if (embedding_idx >= embedding_dim) { break; } + grad_value = grads_ptr[embedding_idx]; float embedding_value = embedding_ptr[embedding_idx]; grad_value += weight_decay * embedding_value; embedding_value -= lr * grad_value; @@ -392,7 +393,8 @@ __global__ void lazy_adam_optimizer_step_kernel(const IndiceT* indices_ptr, int local_dim_idx = threadIdx.x; float grad_value = 0.0f; int embedding_idx = local_dim_idx + loop_start_idx; - if (embedding_idx < embedding_dim) { grad_value = grads_ptr[local_dim_idx + loop_start_idx]; } + if (embedding_idx >= embedding_dim) { break; } + grad_value = grads_ptr[local_dim_idx + loop_start_idx]; float embedding_value = embedding_ptr[embedding_idx]; if (AdamW) { embedding_value -= lr * weight_decay * embedding_value; @@ -644,7 +646,8 @@ __global__ void ada_grad_optimizer_step_kernel(const IndiceT* indices_ptr, int local_dim_idx = threadIdx.x; float grad_value = 0.0f; int embedding_idx = local_dim_idx + loop_start_idx; - if (embedding_idx < embedding_dim) { grad_value = grads_ptr[embedding_idx]; } + if (embedding_idx >= embedding_dim) { break; } + grad_value = grads_ptr[embedding_idx]; float embedding_value = embedding_ptr[embedding_idx]; grad_value = grad_value + weight_decay * embedding_value; float state_sum = state_sum_ptr[embedding_idx]; @@ -841,7 +844,8 @@ __global__ void rms_prop_optimizer_step_kernel(const IndiceT* indices_ptr, int local_dim_idx = threadIdx.x; float grad_value = 0.0f; int embedding_idx = local_dim_idx + loop_start_idx; - if (embedding_idx < embedding_dim) { grad_value = grads_ptr[local_dim_idx + loop_start_idx]; } + if (embedding_idx >= embedding_dim) { break; } + grad_value = grads_ptr[local_dim_idx + loop_start_idx]; float embedding_value = embedding_ptr[embedding_idx]; grad_value = grad_value + weight_decay * embedding_value; float v = v_ptr[embedding_idx]; diff --git a/cpp/tests/wholememory_ops/wholememory_embedding_gradient_apply_tests.cu b/cpp/tests/wholememory_ops/wholememory_embedding_gradient_apply_tests.cu index 453b13b41..bb6360fc0 100644 --- a/cpp/tests/wholememory_ops/wholememory_embedding_gradient_apply_tests.cu +++ b/cpp/tests/wholememory_ops/wholememory_embedding_gradient_apply_tests.cu @@ -149,7 +149,7 @@ struct EmbeddingBackwardTestParams { wholememory_optimizer_type_t optimizer_type = WHOLEMEMORY_OPT_SGD; float cache_ratio = 0.2; bool use_cache = false; - int run_count = 1; + int run_count = 3; float lr_ = 0.1; @@ -428,7 +428,7 @@ void 
prepare_data_and_reference( int64_t end_entry = (thread_rank + 1) * total_entry_count / thread_world_size; CPUOptimizer cpu_optimizer(¶ms, start_entry, end_entry); int embedding_dim = params.grad_description.sizes[1]; - for (int step = 0; step <= params.run_count; step++) { + for (int step = 0; step < params.run_count; step++) { int step_id = std::min(step, params.run_count - 1); std::vector indices; std::vector> grads; @@ -625,7 +625,7 @@ TEST_P(WholeMemoryEmbeddingBackwardParameterTests, EmbeddingGatherGradientApplyT EXPECT_EQ(cudaStreamSynchronize(nullptr), cudaSuccess); EXPECT_EQ(wholememory_communicator_barrier(wm_comm), WHOLEMEMORY_SUCCESS); - for (int run = 0; run <= params.run_count; run++) { + for (int run = 0; run < params.run_count; run++) { int step_id = std::min(run, params.run_count - 1); auto& rank_indices_vec = step_rank_indices[step_id][world_rank]; auto& rank_grads_vec = step_rank_grads[step_id][world_rank]; @@ -737,6 +737,8 @@ INSTANTIATE_TEST_SUITE_P( EmbeddingBackwardTestParams().set_use_cache().set_indice_count(10000127).set_optimizer_type(WHOLEMEMORY_OPT_ADAGRAD), EmbeddingBackwardTestParams().set_use_cache().set_indice_count(10000127).set_optimizer_type(WHOLEMEMORY_OPT_LAZY_ADAM), #endif + EmbeddingBackwardTestParams().set_entry_count(500).set_indice_count(400).set_embedding_dim(4), + EmbeddingBackwardTestParams().set_embedding_dim(3), EmbeddingBackwardTestParams().set_use_cache().set_grad_stride(131), EmbeddingBackwardTestParams().set_use_cache().set_grad_stride(131).set_optimizer_type( WHOLEMEMORY_OPT_RMSPROP),