
Commit

Merge branch 'feature/cuda' of github.com:GlobalArrays/ga into feature/cuda
Bruce J Palmer committed Jul 19, 2023
2 parents 1fb7157 + ea19640 commit fea04de
Showing 7 changed files with 2,302 additions and 11 deletions.
1 change: 0 additions & 1 deletion cmake/ga-compiler-options.cmake
@@ -7,7 +7,6 @@ if(CMAKE_C_COMPILER_ID MATCHES "Clang" OR CMAKE_C_COMPILER_ID MATCHES "IntelLLVM
else()
string( TOUPPER ${CMAKE_BUILD_TYPE} __cmbt_upper )
endif()
-set(CMAKE_C_FLAGS_${__cmbt_upper} "-O3 -g")
endif()

if(CMAKE_C_COMPILER_ID STREQUAL "Clang" OR CMAKE_C_COMPILER_ID STREQUAL "IntelLLVM")
2 changes: 1 addition & 1 deletion comex/src-mpi-pr/comex.c
@@ -58,7 +58,7 @@ sicm_device_list nill;
#define XSTR(x) #x
#define TR(x) XSTR(x)

-#define XENABLE_GPU_AWARE_MPI
+#define ENABLE_GPU_AWARE_MPI
#define ENABLE_STRIDED_KERNELS

#ifdef ENABLE_NVTX
6 changes: 0 additions & 6 deletions comex/src-mpi-pr/dev_utils.h
@@ -6,8 +6,6 @@

#if defined(ENABLE_DEVICE)


#if defined(ENABLE_CUDA)
extern int numDevices();
extern void setDevice(int id);
extern void mallocDevice(void **buf, int size);
@@ -33,10 +31,6 @@ extern void parallelAccumulate(int op, void *src, int *src_stride, void *dst, in
extern int _comex_dev_flag;
extern int _comex_dev_id;

-// #elif defined(ENABLE_HIP)
-// #include "dev_utils_hip.hpp"
#endif

/**
* Skip the intense macro usage and just do this the old-fashioned way
* @param op type of operation including data type
329 changes: 326 additions & 3 deletions comex/src-mpi-pr/dev_utils_hip.cpp
@@ -7,6 +7,8 @@
#include "dev_mem_handle.h"
#include <hip/hip_runtime_api.h>

#include "comex.h"

#define hipErrCheck(stat) \
{ \
hipErrCheck_((stat), __FILE__, __LINE__); \
@@ -155,7 +157,7 @@ void copyToDevice(void *devptr, void *hostptr, int bytes)
*/
void copyToHost(void *hostptr, void *devptr, int bytes)
{
-hipError_t ierr = hipMemcpy(hostptr, devptr, bytes, hipMemcpyDeviceToHost);
+hipError_t ierr = hipMemcpy(hostptr, devptr, bytes, hipMemcpyDeviceToHost);
hipErrCheck(ierr);
if (ierr != hipSuccess) {
hipPointerAttribute_t src_attr, dst_attr;
@@ -195,7 +197,7 @@ void copyToHost(void *hostptr, void *devptr, int bytes)
void copyDevToDev(void *dstptr, void *srcptr, int bytes)
{
hipError_t ierr;
-ierr = hipMemcpy(dstptr, srcptr, bytes, hipMemcpyDeviceToDevice);
+ierr = hipMemcpy(dstptr, srcptr, bytes, hipMemcpyDeviceToDevice);
hipErrCheck(ierr);
hipDeviceSynchronize();
if (ierr != hipSuccess) {
@@ -272,7 +274,7 @@ void deviceIaxpy(int *dst, int *src, const int *scale, int n)
hipPointerAttribute_t src_attr;
hipPointerAttribute_t dst_attr;
hipError_t perr = hipPointerGetAttributes(&src_attr, src);
-if (perr != hipSuccess || src_attr.memoryType != hipMemoryTypeDevice {
+if (perr != hipSuccess || src_attr.memoryType != hipMemoryTypeDevice) {
printf("p[%d] deviceIaxpy src pointer is on host\n",rank);
} else if (src_attr.memoryType == hipMemoryTypeDevice) {
printf("p[%d] deviceIaxpy src pointer is on device %d\n",rank,src_attr.device);
@@ -368,5 +370,326 @@ int deviceCloseMemHandle(void *memory)
return hipIpcCloseMemHandle(memory);
}

#define MAXDIM 7
struct strided_kernel_arg {
void *dst;
void *src;
int dst_strides[MAXDIM]; /* smallest strides are first */
int src_strides[MAXDIM];
int dims[MAXDIM]; /* dimensions of block being transferred */
int stride_levels; /* dimension of array minus 1 */
int elem_size; /* size of array elements */
int totalCopyElems; /* total number of elements to copy */
int elements_per_block; /* number of elements copied by each thread */
int op; /* accumulate operation (if applicable) */
char scale[64]; /* accumulate scale parameter */
};
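
The scale factor travels in a fixed 64-byte buffer so a single argument struct, passed to the kernels by value, can carry any supported scale type; the largest, a double complex, needs 16 bytes. A hypothetical compile-time check (not part of the commit) of that headroom:

/* Illustrative check, not in the commit: the largest scale type is a
   double complex (two doubles), which fits in the 64-byte buffer */
static_assert(2*sizeof(double) <= sizeof(strided_kernel_arg::scale),
              "scale buffer must hold a double complex");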

__global__ void strided_memcpy_kernel(strided_kernel_arg arg) {
int index = threadIdx.x;
int stride = blockIdx.x;

int i;
int idx[MAXDIM];
int currElem = 0;
int elements_per_block = arg.elements_per_block;
int bytes_per_thread = arg.elem_size*elements_per_block;
int stride_levels = arg.stride_levels;
int src_block_offset; /* Offset based on chunk_index */
int dst_block_offset; /* Offset based on chunk_index */

/* Determine location of chunk_index in array based
on the thread id and the block id */
index = index + stride * blockDim.x;
/* If the thread index exceeds the total number of elements
to copy, this thread does not participate in the copy */
if(index >= arg.totalCopyElems) {
return;
}
/* Find the indices that mark the location of this element within
the block of data that will be moved */
index *= elements_per_block;
// Calculate the index starting points
for (i=0; i<=stride_levels; i++) {
idx[i] = index%arg.dims[i];
index = (index-idx[i])/arg.dims[i];
}
/* Calculate the block offset for this thread */
src_block_offset = bytes_per_thread*idx[0];
dst_block_offset = bytes_per_thread*idx[0];
for (i=0; i<stride_levels; i++) {
src_block_offset += arg.src_strides[i]*idx[i+1]*bytes_per_thread;
dst_block_offset += arg.dst_strides[i]*idx[i+1]*bytes_per_thread;
}

/* Start copying element by element
TODO: Make sure that the data is contiguous and replace the loop
with a single memcpy */
memcpy((char*)arg.dst + dst_block_offset + currElem * bytes_per_thread,
(char*)arg.src + src_block_offset + currElem * bytes_per_thread,
elements_per_block*bytes_per_thread);
/* Synchronize the threads before returning */
__syncthreads();
}
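
The kernel above recovers each thread's multi-dimensional position from a flat index by repeated modulo and division against the block dimensions. A minimal host-side sketch (illustration only, not part of the commit) of the same decomposition, decoding flat index 7 in a hypothetical 3x4 block:

#include <cassert>

int main() {
  int dims[2] = {3, 4};  /* dims[0] is the fastest-varying dimension */
  int index = 7;         /* flat index to decode */
  int idx[2];
  for (int i=0; i<2; i++) {
    idx[i] = index%dims[i];           /* coordinate along dimension i */
    index = (index-idx[i])/dims[i];   /* peel off that dimension */
  }
  assert(idx[0] == 1 && idx[1] == 2); /* 7 = 1 + 2*3 */
  return 0;
}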

#define TTHREADS 1024
void parallelMemcpy(void *src, /* starting pointer of source data */
int *src_stride, /* strides of source data */
void *dst, /* starting pointer of destination data */
int *dst_stride, /* strides of destination data */
int *count, /* dimensions of data block to be transferred */
int stride_levels) /* number of stride levels */
{
int src_on_host = isHostPointer(src);
int dst_on_host = isHostPointer(dst);
void *msrc;
void *mdst;
int total_elems;
int nblocks;
strided_kernel_arg arg;
int i;

/* if src or dst is on host, map pointer to device */
if (src_on_host) {
/* Figure out how large the memory segment is. If this routine is being
called, stride_levels must be at least 1 */
int total = count[stride_levels]*src_stride[stride_levels-1];
/* Register the host pointer so the device can access it */
hipHostRegister(src, total, hipHostRegisterMapped);
/* Get the matching device pointer for the mapped host memory */
hipHostGetDevicePointer(&msrc, src, 0);
} else {
msrc = src;
}
if (dst_on_host) {
/* Figure out how large the memory segment is. If this routine is being
called, stride_levels must be at least 1 */
int total = count[stride_levels]*dst_stride[stride_levels-1];
/* Register the host pointer so the device can access it */
hipHostRegister(dst, total, hipHostRegisterMapped);
/* Get the matching device pointer for the mapped host memory */
hipHostGetDevicePointer(&mdst, dst, 0);
} else {
mdst = dst;
}

/* total_elems = count[0]/elem_size; */
total_elems = 1;
for (i=0; i<stride_levels; i++) total_elems *= count[i+1];

if(total_elems < TTHREADS){
nblocks = 1;
} else {
nblocks = int(ceil(((float)total_elems)/((float)TTHREADS)));
}

arg.dst = mdst;
arg.src = msrc;
arg.elem_size = count[0];
for (i=0; i<stride_levels; i++) {
arg.dst_strides[i] = dst_stride[i]/arg.elem_size;
arg.src_strides[i] = src_stride[i]/arg.elem_size;
}
for (i=0; i<=stride_levels; i++) arg.dims[i] = count[i];
arg.dims[0] = 1;
arg.stride_levels = stride_levels;
/* arg.elem_size = elem_size; */
arg.totalCopyElems = total_elems;
arg.elements_per_block = 1;

strided_memcpy_kernel<<<nblocks, TTHREADS>>>(arg);

/*
hipDeviceSynchronize();
*/
if (src_on_host) {
hipHostUnregister(src);
}
if (dst_on_host) {
hipHostUnregister(dst);
}


}
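
For reference, a hypothetical call sketch (not from the commit) for a 2D copy with one stride level, assuming the comex convention used above: count[0] is the contiguous segment length in bytes and the strides are byte strides.

double *src_dev, *dst_dev;  /* assume these point to device memory */
int count[2];
int src_stride[1], dst_stride[1];
count[0] = 10*(int)sizeof(double);        /* 10 contiguous doubles per segment */
count[1] = 5;                             /* 5 segments */
src_stride[0] = 100*(int)sizeof(double);  /* leading dimension of source */
dst_stride[0] = 200*(int)sizeof(double);  /* leading dimension of destination */
parallelMemcpy(src_dev, src_stride, dst_dev, dst_stride, count, 1);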

__global__ void strided_accumulate_kernel(strided_kernel_arg arg) {
int index = threadIdx.x;
int stride = blockIdx.x;

int i;
int idx[MAXDIM];
int elements_per_block = arg.elements_per_block;
int bytes_per_thread = arg.elem_size*elements_per_block;
int stride_levels = arg.stride_levels;
int src_block_offset; /* Offset based on chunk_index */
int dst_block_offset; /* Offset based on chunk_index */
void *src, *dst;
int op;

/* Determine location of chunk_index in array based
on the thread id and the block id */
index = index + stride * blockDim.x;
/* If the thread index exceeds the total number of elements
to copy, this thread does not participate in the copy */
if(index >= arg.totalCopyElems) {
return;
}
/* Find the indices that mark the location of this element within
the block of data that will be moved */
// index *= elements_per_block;
// Calculate the index starting points
for (i=0; i<=stride_levels; i++) {
idx[i] = index%arg.dims[i];
index = (index-idx[i])/arg.dims[i];
}
/* Calculate the block offset for this thread */
src_block_offset = bytes_per_thread*idx[0];
dst_block_offset = bytes_per_thread*idx[0];
for (i=0; i<stride_levels; i++) {
src_block_offset += arg.src_strides[i]*idx[i+1];
dst_block_offset += arg.dst_strides[i]*idx[i+1];
}

/* Start accumulating element by element
TODO: Make sure that the data is contiguous and replace the loop
with a single memcpy */
src = (void*)((char*)arg.src + src_block_offset);
dst = (void*)((char*)arg.dst + dst_block_offset);
op = arg.op;
if (op == COMEX_ACC_INT) {
int a = *((int*)src);
int scale = *((int*)arg.scale);
*((int*)dst) += a*scale;
} else if (op == COMEX_ACC_LNG) {
long a = *((long*)src);
long scale = *((long*)arg.scale);
*((long*)dst) += a*scale;
} else if (op == COMEX_ACC_FLT) {
float a = *((float*)src);
float scale = *((float*)arg.scale);
*((float*)dst) += a*scale;
} else if (op == COMEX_ACC_DBL) {
double a = *((double*)src);
double scale = *((double*)arg.scale);
*((double*)dst) += a*scale;
} else if (op == COMEX_ACC_CPL) {
float ar = *((float*)src);
float ai = *(((float*)src)+1);
float scaler = *((float*)arg.scale);
float scalei = *(((float*)arg.scale)+1);
*((float*)dst) += ar*scaler-ai*scalei;
*(((float*)dst)+1) += ar*scalei+ai*scaler;
} else if (op == COMEX_ACC_DCP) {
double ar = *((double*)src);
double ai = *(((double*)src)+1);
double scaler = *((double*)arg.scale);
double scalei = *(((double*)arg.scale)+1);
*((double*)dst) += ar*scaler-ai*scalei;
*(((double*)dst)+1) += ar*scalei+ai*scaler;
}
/* Synchronize the threads before returning */
__syncthreads();
}
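
The COMEX_ACC_CPL and COMEX_ACC_DCP branches expand the complex multiply-accumulate dst += a*scale by hand. A host-side check (illustration only, not part of the commit) that the expansion matches std::complex arithmetic:

#include <cassert>
#include <complex>

int main() {
  float a[2] = {1.0f, 2.0f};  /* a = 1 + 2i */
  float s[2] = {3.0f, 4.0f};  /* scale = 3 + 4i */
  float d[2] = {0.0f, 0.0f};
  /* same arithmetic as the COMEX_ACC_CPL branch above */
  d[0] += a[0]*s[0] - a[1]*s[1];
  d[1] += a[0]*s[1] + a[1]*s[0];
  std::complex<float> ref = std::complex<float>(1.0f, 2.0f)
                          * std::complex<float>(3.0f, 4.0f);
  assert(d[0] == ref.real() && d[1] == ref.imag()); /* -5 + 10i */
  return 0;
}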

void parallelAccumulate(int op, /* accumulate operation */
void *src, /* starting pointer of source data */
int *src_stride, /* strides of source data */
void *dst, /* starting pointer of destination data */
int *dst_stride, /* strides of destination data */
int *count, /* dimensions of data block to be transferred */
int stride_levels, /* number of stride levels */
void *scale) /* scale factor in accumulate */
{
int src_on_host = isHostPointer(src);
int dst_on_host = isHostPointer(dst);
void *msrc;
void *mdst;
int total_elems;
int elem_size;
int nblocks;
strided_kernel_arg arg;
int i;

/* if src or dst is on host, map pointer to device */
if (src_on_host) {
/* Figure out how large the memory segment is. If this routine is being
called, stride_levels must be at least 1 */
int total = count[stride_levels]*src_stride[stride_levels-1];
/* Register the host pointer so the device can access it */
hipHostRegister(src, total, hipHostRegisterMapped);
/* Get the matching device pointer for the mapped host memory */
hipHostGetDevicePointer(&msrc, src, 0);
} else {
msrc = src;
}
if (dst_on_host) {
/* Figure out how large the memory segment is. If this routine is being
called, stride_levels must be at least 1 */
int total = count[stride_levels]*dst_stride[stride_levels-1];
/* Register the host pointer so the device can access it */
hipHostRegister(dst, total, hipHostRegisterMapped);
/* Get the matching device pointer for the mapped host memory */
hipHostGetDevicePointer(&mdst, dst, 0);
} else {
mdst = dst;
}

/* total_elems = count[0]/elem_size; */
if (op == COMEX_ACC_INT) {
elem_size = sizeof(int);
*((int*)arg.scale) = *((int*)scale);
} else if (op == COMEX_ACC_LNG) {
elem_size = sizeof(long);
*((long*)arg.scale) = *((long*)scale);
} else if (op == COMEX_ACC_FLT) {
elem_size = sizeof(float);
*((float*)arg.scale) = *((float*)scale);
} else if (op == COMEX_ACC_DBL) {
elem_size = sizeof(double);
*((double*)arg.scale) = *((double*)scale);
} else if (op == COMEX_ACC_CPL) {
elem_size = 2*sizeof(float);
*((float*)arg.scale) = *((float*)scale);
*(((float*)arg.scale)+1) = *(((float*)scale)+1);
} else if (op == COMEX_ACC_DCP) {
elem_size = 2*sizeof(double);
*((double*)arg.scale) = *((double*)scale);
*(((double*)arg.scale)+1) = *(((double*)scale)+1);
}

total_elems = count[0]/elem_size;
for (i=0; i<stride_levels; i++) total_elems *= count[i+1];

if(total_elems < TTHREADS){
nblocks = 1;
} else {
nblocks = int(ceil(((float)total_elems)/((float)TTHREADS)));
}

arg.src = msrc;
arg.dst = mdst;
arg.elem_size = elem_size;
arg.op = op;
for (i=0; i<stride_levels; i++) {
arg.dst_strides[i] = dst_stride[i];
arg.src_strides[i] = src_stride[i];
}
for (i=0; i<=stride_levels; i++) arg.dims[i] = count[i];
arg.dims[0] = count[0]/elem_size;
arg.stride_levels = stride_levels;
/* arg.elem_size = elem_size; */
arg.totalCopyElems = total_elems;
arg.elements_per_block = 1;

strided_accumulate_kernel<<<nblocks, TTHREADS>>>(arg);

/*
hipDeviceSynchronize();
*/
if (src_on_host) {
hipHostUnregister(src);
}
if (dst_on_host) {
hipHostUnregister(dst);
}
}
};
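
A matching hypothetical call sketch (not from the commit) for the accumulate path, computing dst += 2.0*src over the same 2D shape. Note that parallelAccumulate derives the per-segment element count as count[0]/elem_size from the operation type, unlike parallelMemcpy, which treats count[0] itself as the element size.

double *src_dev, *dst_dev;  /* assume these point to device memory */
double scale = 2.0;
int count[2] = {10*(int)sizeof(double), 5};     /* 10 doubles x 5 segments */
int src_stride[1] = {100*(int)sizeof(double)};  /* byte strides */
int dst_stride[1] = {200*(int)sizeof(double)};
parallelAccumulate(COMEX_ACC_DBL, src_dev, src_stride,
                   dst_dev, dst_stride, count, 1, &scale);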
