
Commit

Merge branch 'feature/cuda' of github.com:GlobalArrays/ga into feature/cuda
Bruce J Palmer committed Jul 19, 2023
2 parents 1fb7157 + ea19640 commit fea04de
Showing 7 changed files with 2,302 additions and 11 deletions.
1 change: 0 additions & 1 deletion cmake/ga-compiler-options.cmake
@@ -7,7 +7,6 @@ if(CMAKE_C_COMPILER_ID MATCHES "Clang" OR CMAKE_C_COMPILER_ID MATCHES "IntelLLVM
else()
string( TOUPPER ${CMAKE_BUILD_TYPE} __cmbt_upper )
endif()
-set(CMAKE_C_FLAGS_${__cmbt_upper} "-O3 -g")
endif()

if(CMAKE_C_COMPILER_ID STREQUAL "Clang" OR CMAKE_C_COMPILER_ID STREQUAL "IntelLLVM")
2 changes: 1 addition & 1 deletion comex/src-mpi-pr/comex.c
@@ -58,7 +58,7 @@ sicm_device_list nill;
#define XSTR(x) #x
#define TR(x) XSTR(x)

-#define XENABLE_GPU_AWARE_MPI
+#define ENABLE_GPU_AWARE_MPI
#define ENABLE_STRIDED_KERNELS

#ifdef ENABLE_NVTX
6 changes: 0 additions & 6 deletions comex/src-mpi-pr/dev_utils.h
@@ -6,8 +6,6 @@

#if defined(ENABLE_DEVICE)


#if defined(ENABLE_CUDA)
extern int numDevices();
extern void setDevice(int id);
extern void mallocDevice(void **buf, int size);
@@ -33,10 +31,6 @@ extern void parallelAccumulate(int op, void *src, int *src_stride, void *dst, in
extern int _comex_dev_flag;
extern int _comex_dev_id;

-// #elif defined(ENABLE_HIP)
-// #include "dev_utils_hip.hpp"
#endif

/**
* Skip the intense macro usage and just do this the old-fashioned way
* @param op type of operation including data type
329 changes: 326 additions & 3 deletions comex/src-mpi-pr/dev_utils_hip.cpp
@@ -7,6 +7,8 @@
#include "dev_mem_handle.h"
#include <hip/hip_runtime_api.h>

#include "comex.h"

#define hipErrCheck(stat) \
{ \
hipErrCheck_((stat), __FILE__, __LINE__); \
@@ -155,7 +157,7 @@ void copyToDevice(void *devptr, void *hostptr, int bytes)
*/
void copyToHost(void *hostptr, void *devptr, int bytes)
{
-hipError_t ierr = hipMemcpy(hostptr, devptr, bytes, hipMemcpyDeviceToHost);
+hipError_t ierr = hipMemcpy(hostptr, devptr, bytes, hipMemcpyDeviceToHost);
hipErrCheck(ierr);
if (ierr != hipSuccess) {
hipPointerAttribute_t src_attr, dst_attr;
@@ -195,7 +197,7 @@ void copyToHost(void *hostptr, void *devptr, int bytes)
void copyDevToDev(void *dstptr, void *srcptr, int bytes)
{
hipError_t ierr;
-ierr = hipMemcpy(dstptr, srcptr, bytes, hipMemcpyDeviceToDevice);
+ierr = hipMemcpy(dstptr, srcptr, bytes, hipMemcpyDeviceToDevice);
hipErrCheck(ierr);
hipDeviceSynchronize();
if (ierr != hipSuccess) {
@@ -272,7 +274,7 @@ void deviceIaxpy(int *dst, int *src, const int *scale, int n)
hipPointerAttribute_t src_attr;
hipPointerAttribute_t dst_attr;
hipError_t perr = hipPointerGetAttributes(&src_attr, src);
-if (perr != hipSuccess || src_attr.memoryType != hipMemoryTypeDevice {
+if (perr != hipSuccess || src_attr.memoryType != hipMemoryTypeDevice) {
printf("p[%d] deviceIaxpy src pointer is on host\n",rank);
} else if (src_attr.memoryType == hipMemoryTypeDevice) {
printf("p[%d] deviceIaxpy src pointer is on device %d\n",rank,src_attr.device);
@@ -368,5 +370,326 @@ int deviceCloseMemHandle(void *memory)
return hipIpcCloseMemHandle(memory);
}

#define MAXDIM 7
struct strided_kernel_arg {
void *dst;
void *src;
int dst_strides[MAXDIM]; /* smallest strides are first */
int src_strides[MAXDIM];
int dims[MAXDIM]; /* dimensions of block being transferred */
int stride_levels; /* dimension of array minus 1 */
int elem_size; /* size of array elements */
int totalCopyElems; /* total number of elements to copy */
int elements_per_block; /* number of elements copied by each thread */
int op; /* accumulate operation (if applicable) */
char scale[64]; /* accumulate scale parameter */
};
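
The scale factor travels in a fixed 64-byte buffer so a single argument struct, passed to the kernels by value, can carry any supported scale type; the largest, a double complex, needs 16 bytes. A hypothetical compile-time check (not part of the commit) of that headroom:

/* Illustrative check, not in the commit: the largest scale type is a
   double complex (two doubles), which fits in the 64-byte buffer */
static_assert(2*sizeof(double) <= sizeof(strided_kernel_arg::scale),
              "scale buffer must hold a double complex");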

__global__ void strided_memcpy_kernel(strided_kernel_arg arg) {
int index = threadIdx.x;
int stride = blockIdx.x;

int i;
int idx[MAXDIM];
int currElem = 0;
int elements_per_block = arg.elements_per_block;
int bytes_per_thread = arg.elem_size*elements_per_block;
int stride_levels = arg.stride_levels;
int src_block_offset; /* Offset based on chunk_index */
int dst_block_offset; /* Offset based on chunk_index */

/* Determine location of chunk_index in array based
on the thread id and the block id */
index = index + stride * blockDim.x;
/* If the thread index exceeds the total number of elements
to copy, this thread does not participate in the copy */
if(index >= arg.totalCopyElems) {
return;
}
/* Find the indices that mark the location of this element within
the block of data that will be moved */
index *= elements_per_block;
// Calculate the index starting points
for (i=0; i<=stride_levels; i++) {
idx[i] = index%arg.dims[i];
index = (index-idx[i])/arg.dims[i];
}
/* Calculate the block offset for this thread */
src_block_offset = bytes_per_thread*idx[0];
dst_block_offset = bytes_per_thread*idx[0];
for (i=0; i<stride_levels; i++) {
src_block_offset += arg.src_strides[i]*idx[i+1]*bytes_per_thread;
dst_block_offset += arg.dst_strides[i]*idx[i+1]*bytes_per_thread;
}

/* Start copying element by element
TODO: Make sure that the data is contiguous and replace the loop
with a single memcpy */
memcpy((char*)arg.dst + dst_block_offset + currElem * bytes_per_thread,
(char*)arg.src + src_block_offset + currElem * bytes_per_thread,
elements_per_block*bytes_per_thread);
/* Synchronize the threads before returning */
__syncthreads();
}
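
The kernel above recovers each thread's multi-dimensional position from a flat index by repeated modulo and division against the block dimensions. A minimal host-side sketch (illustration only, not part of the commit) of the same decomposition, decoding flat index 7 in a hypothetical 3x4 block:

#include <cassert>

int main() {
  int dims[2] = {3, 4};  /* dims[0] is the fastest-varying dimension */
  int index = 7;         /* flat index to decode */
  int idx[2];
  for (int i=0; i<2; i++) {
    idx[i] = index%dims[i];           /* coordinate along dimension i */
    index = (index-idx[i])/dims[i];   /* peel off that dimension */
  }
  assert(idx[0] == 1 && idx[1] == 2); /* 7 = 1 + 2*3 */
  return 0;
}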

#define TTHREADS 1024
void parallelMemcpy(void *src, /* starting pointer of source data */
int *src_stride, /* strides of source data */
void *dst, /* starting pointer of destination data */
int *dst_stride, /* strides of destination data */
int *count, /* dimensions of data block to be transferred */
int stride_levels) /* number of stride levels */
{
int src_on_host = isHostPointer(src);
int dst_on_host = isHostPointer(dst);
void *msrc;
void *mdst;
int total_elems;
int nblocks;
strided_kernel_arg arg;
int i;

/* if src or dst is on host, map pointer to device */
if (src_on_host) {
/* Figure out how large the memory segment is. If this routine is being
called, stride_levels must be at least 1 */
int total = count[stride_levels]*src_stride[stride_levels-1];
/* Register the host pointer so the device can access it */
hipHostRegister(src, total, hipHostRegisterMapped);
/* Get the matching device pointer for the mapped host memory */
hipHostGetDevicePointer(&msrc, src, 0);
} else {
msrc = src;
}
if (dst_on_host) {
/* Figure out how large the memory segment is. If this routine is being
called, stride_levels must be at least 1 */
int total = count[stride_levels]*dst_stride[stride_levels-1];
/* Register the host pointer so the device can access it */
hipHostRegister(dst, total, hipHostRegisterMapped);
/* Get the matching device pointer for the mapped host memory */
hipHostGetDevicePointer(&mdst, dst, 0);
} else {
mdst = dst;
}

/* total_elems = count[0]/elem_size; */
total_elems = 1;
for (i=0; i<stride_levels; i++) total_elems *= count[i+1];

if(total_elems < TTHREADS){
nblocks = 1;
} else {
nblocks = int(ceil(((float)total_elems)/((float)TTHREADS)));
}

arg.dst = mdst;
arg.src = msrc;
arg.elem_size = count[0];
for (i=0; i<stride_levels; i++) {
arg.dst_strides[i] = dst_stride[i]/arg.elem_size;
arg.src_strides[i] = src_stride[i]/arg.elem_size;
}
for (i=0; i<=stride_levels; i++) arg.dims[i] = count[i];
arg.dims[0] = 1;
arg.stride_levels = stride_levels;
/* arg.elem_size = elem_size; */
arg.totalCopyElems = total_elems;
arg.elements_per_block = 1;

strided_memcpy_kernel<<<nblocks, TTHREADS>>>(arg);

/*
hipDeviceSynchronize();
*/
if (src_on_host) {
hipHostUnregister(src);
}
if (dst_on_host) {
hipHostUnregister(dst);
}


}
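
For reference, a hypothetical call sketch (not from the commit) for a 2D copy with one stride level, assuming the comex convention used above: count[0] is the contiguous segment length in bytes and the strides are byte strides.

double *src_dev, *dst_dev;  /* assume these point to device memory */
int count[2];
int src_stride[1], dst_stride[1];
count[0] = 10*(int)sizeof(double);        /* 10 contiguous doubles per segment */
count[1] = 5;                             /* 5 segments */
src_stride[0] = 100*(int)sizeof(double);  /* leading dimension of source */
dst_stride[0] = 200*(int)sizeof(double);  /* leading dimension of destination */
parallelMemcpy(src_dev, src_stride, dst_dev, dst_stride, count, 1);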

__global__ void strided_accumulate_kernel(strided_kernel_arg arg) {
int index = threadIdx.x;
int stride = blockIdx.x;

int i;
int idx[MAXDIM];
int elements_per_block = arg.elements_per_block;
int bytes_per_thread = arg.elem_size*elements_per_block;
int stride_levels = arg.stride_levels;
int src_block_offset; /* Offset based on chunk_index */
int dst_block_offset; /* Offset based on chunk_index */
void *src, *dst;
int op;

/* Determine location of chunk_index in array based
on the thread id and the block id */
index = index + stride * blockDim.x;
/* If the thread index exceeds the total number of elements
to copy, this thread does not participate in the copy */
if(index >= arg.totalCopyElems) {
return;
}
/* Find the indices that mark the location of this element within
the block of data that will be moved */
// index *= elements_per_block;
// Calculate the index starting points
for (i=0; i<=stride_levels; i++) {
idx[i] = index%arg.dims[i];
index = (index-idx[i])/arg.dims[i];
}
/* Calculate the block offset for this thread */
src_block_offset = bytes_per_thread*idx[0];
dst_block_offset = bytes_per_thread*idx[0];
for (i=0; i<stride_levels; i++) {
src_block_offset += arg.src_strides[i]*idx[i+1];
dst_block_offset += arg.dst_strides[i]*idx[i+1];
}

/* Start accumulating element by element
TODO: Make sure that the data is contiguous and replace the loop
with a single memcpy */
src = (void*)((char*)arg.src + src_block_offset);
dst = (void*)((char*)arg.dst + dst_block_offset);
op = arg.op;
if (op == COMEX_ACC_INT) {
int a = *((int*)src);
int scale = *((int*)arg.scale);
*((int*)dst) += a*scale;
} else if (op == COMEX_ACC_LNG) {
long a = *((long*)src);
long scale = *((long*)arg.scale);
*((long*)dst) += a*scale;
} else if (op == COMEX_ACC_FLT) {
float a = *((float*)src);
float scale = *((float*)arg.scale);
*((float*)dst) += a*scale;
} else if (op == COMEX_ACC_DBL) {
double a = *((double*)src);
double scale = *((double*)arg.scale);
*((double*)dst) += a*scale;
} else if (op == COMEX_ACC_CPL) {
float ar = *((float*)src);
float ai = *(((float*)src)+1);
float scaler = *((float*)arg.scale);
float scalei = *(((float*)arg.scale)+1);
*((float*)dst) += ar*scaler-ai*scalei;
*(((float*)dst)+1) += ar*scalei+ai*scaler;
} else if (op == COMEX_ACC_DCP) {
double ar = *((double*)src);
double ai = *(((double*)src)+1);
double scaler = *((double*)arg.scale);
double scalei = *(((double*)arg.scale)+1);
*((double*)dst) += ar*scaler-ai*scalei;
*(((double*)dst)+1) += ar*scalei+ai*scaler;
}
/* Synchronize the threads before returning */
__syncthreads();
}
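
The COMEX_ACC_CPL and COMEX_ACC_DCP branches expand the complex multiply-accumulate dst += a*scale by hand. A host-side check (illustration only, not part of the commit) that the expansion matches std::complex arithmetic:

#include <cassert>
#include <complex>

int main() {
  float a[2] = {1.0f, 2.0f};  /* a = 1 + 2i */
  float s[2] = {3.0f, 4.0f};  /* scale = 3 + 4i */
  float d[2] = {0.0f, 0.0f};
  /* same arithmetic as the COMEX_ACC_CPL branch above */
  d[0] += a[0]*s[0] - a[1]*s[1];
  d[1] += a[0]*s[1] + a[1]*s[0];
  std::complex<float> ref = std::complex<float>(1.0f, 2.0f)
                          * std::complex<float>(3.0f, 4.0f);
  assert(d[0] == ref.real() && d[1] == ref.imag()); /* -5 + 10i */
  return 0;
}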

void parallelAccumulate(int op, /* accumulate operation */
void *src, /* starting pointer of source data */
int *src_stride, /* strides of source data */
void *dst, /* starting pointer of destination data */
int *dst_stride, /* strides of destination data */
int *count, /* dimensions of data block to be transferred */
int stride_levels, /* number of stride levels */
void *scale) /* scale factor in accumulate */
{
int src_on_host = isHostPointer(src);
int dst_on_host = isHostPointer(dst);
void *msrc;
void *mdst;
int total_elems;
int elem_size;
int nblocks;
strided_kernel_arg arg;
int i;

/* if src or dst is on host, map pointer to device */
if (src_on_host) {
/* Figure out how large the memory segment is. If this routine is being
called, stride_levels must be at least 1 */
int total = count[stride_levels]*src_stride[stride_levels-1];
/* Register the host pointer so the device can access it */
hipHostRegister(src, total, hipHostRegisterMapped);
/* Get the matching device pointer for the mapped host memory */
hipHostGetDevicePointer(&msrc, src, 0);
} else {
msrc = src;
}
if (dst_on_host) {
/* Figure out how large the memory segment is. If this routine is being
called, stride_levels must be at least 1 */
int total = count[stride_levels]*dst_stride[stride_levels-1];
/* Register the host pointer so the device can access it */
hipHostRegister(dst, total, hipHostRegisterMapped);
/* Get the matching device pointer for the mapped host memory */
hipHostGetDevicePointer(&mdst, dst, 0);
} else {
mdst = dst;
}

/* total_elems = count[0]/elem_size; */
if (op == COMEX_ACC_INT) {
elem_size = sizeof(int);
*((int*)arg.scale) = *((int*)scale);
} else if (op == COMEX_ACC_LNG) {
elem_size = sizeof(long);
*((long*)arg.scale) = *((long*)scale);
} else if (op == COMEX_ACC_FLT) {
elem_size = sizeof(float);
*((float*)arg.scale) = *((float*)scale);
} else if (op == COMEX_ACC_DBL) {
elem_size = sizeof(double);
*((double*)arg.scale) = *((double*)scale);
} else if (op == COMEX_ACC_CPL) {
elem_size = 2*sizeof(float);
*((float*)arg.scale) = *((float*)scale);
*(((float*)arg.scale)+1) = *(((float*)scale)+1);
} else if (op == COMEX_ACC_DCP) {
elem_size = 2*sizeof(double);
*((double*)arg.scale) = *((double*)scale);
*(((double*)arg.scale)+1) = *(((double*)scale)+1);
}

total_elems = count[0]/elem_size;
for (i=0; i<stride_levels; i++) total_elems *= count[i+1];

if(total_elems < TTHREADS){
nblocks = 1;
} else {
nblocks = int(ceil(((float)total_elems)/((float)TTHREADS)));
}

arg.src = msrc;
arg.dst = mdst;
arg.elem_size = elem_size;
arg.op = op;
for (i=0; i<stride_levels; i++) {
arg.dst_strides[i] = dst_stride[i];
arg.src_strides[i] = src_stride[i];
}
for (i=0; i<=stride_levels; i++) arg.dims[i] = count[i];
arg.dims[0] = count[0]/elem_size;
arg.stride_levels = stride_levels;
/* arg.elem_size = elem_size; */
arg.totalCopyElems = total_elems;
arg.elements_per_block = 1;

strided_accumulate_kernel<<<nblocks, TTHREADS>>>(arg);

/*
hipDeviceSynchronize();
*/
if (src_on_host) {
hipHostUnregister(src);
}
if (dst_on_host) {
hipHostUnregister(dst);
}
}
};
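
A matching hypothetical call sketch (not from the commit) for the accumulate path, computing dst += 2.0*src over the same 2D shape. Note that parallelAccumulate derives the per-segment element count as count[0]/elem_size from the operation type, unlike parallelMemcpy, which treats count[0] itself as the element size.

double *src_dev, *dst_dev;  /* assume these point to device memory */
double scale = 2.0;
int count[2] = {10*(int)sizeof(double), 5};     /* 10 doubles x 5 segments */
int src_stride[1] = {100*(int)sizeof(double)};  /* byte strides */
int dst_stride[1] = {200*(int)sizeof(double)};
parallelAccumulate(COMEX_ACC_DBL, src_dev, src_stride,
                   dst_dev, dst_stride, count, 1, &scale);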
