Skip to content

Commit

Permalink
Adding MI300A specific optimizations for GPU aware MPI.
Browse files Browse the repository at this point in the history
  • Loading branch information
PaulMullowney committed Dec 11, 2024
1 parent 5b369ae commit 79931dc
Show file tree
Hide file tree
Showing 10 changed files with 205 additions and 46 deletions.
1 change: 1 addition & 0 deletions src/trans/gpu/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
list( APPEND trans_gpu_common_src
algor/ext_acc.F90
algor/c_hipmemgetinfo.cpp
algor/hip_allocator_mod.F90
algor/buffered_allocator_mod.F90
algor/device_mod.F90
algor/growing_allocator_mod.F90
Expand Down
55 changes: 53 additions & 2 deletions src/trans/gpu/algor/buffered_allocator_mod.F90
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ MODULE BUFFERED_ALLOCATOR_MOD
IMPLICIT NONE

PRIVATE
PUBLIC :: BUFFERED_ALLOCATOR, ALLOCATION_RESERVATION_HANDLE, RESERVE, ASSIGN_PTR, GET_ALLOCATION
PUBLIC :: BUFFERED_ALLOCATOR, ALLOCATION_RESERVATION_HANDLE, RESERVE, RESERVE_GAM, ASSIGN_PTR, GET_ALLOCATION, GET_ALLOCATION_GAM
PUBLIC :: MAKE_BUFFERED_ALLOCATOR, INSTANTIATE_ALLOCATOR

! The buffered allocator uses double buffering. The idea is that the allocator
Expand All @@ -44,10 +44,19 @@ MODULE BUFFERED_ALLOCATOR_MOD
INTEGER(KIND=C_SIZE_T) :: BUFR_SZ(0:NBUF-1)
INTEGER(KIND=JPIM) :: NEXT_BUF
TYPE(GROWING_ALLOCATION_TYPE), POINTER :: PTR

! GPU Aware MPI versions
INTEGER(KIND=C_SIZE_T) :: GAM_BUFR_SZ(0:NBUF-1)
INTEGER(KIND=JPIM) :: GAM_NEXT_BUF
TYPE(GROWING_ALLOCATION_TYPE), POINTER :: GAM_PTR
END TYPE
TYPE ALLOCATION_RESERVATION_HANDLE
INTEGER(KIND=C_SIZE_T) :: SZ
INTEGER(KIND=JPIM) :: BUF

! GPU Aware MPI versions
INTEGER(KIND=C_SIZE_T) :: GAM_SZ
INTEGER(KIND=JPIM) :: GAM_BUF
END TYPE

INTERFACE ASSIGN_PTR
Expand All @@ -67,6 +76,9 @@ FUNCTION MAKE_BUFFERED_ALLOCATOR()

MAKE_BUFFERED_ALLOCATOR%BUFR_SZ(:) = 0
MAKE_BUFFERED_ALLOCATOR%NEXT_BUF = 0

MAKE_BUFFERED_ALLOCATOR%GAM_BUFR_SZ(:) = 0
MAKE_BUFFERED_ALLOCATOR%GAM_NEXT_BUF = 0
END FUNCTION MAKE_BUFFERED_ALLOCATOR

FUNCTION RESERVE(ALLOCATOR, SZ)
Expand All @@ -83,8 +95,22 @@ FUNCTION RESERVE(ALLOCATOR, SZ)
ALLOCATOR%NEXT_BUF = MOD(ALLOCATOR%NEXT_BUF+1,NBUF)
END FUNCTION RESERVE

FUNCTION RESERVE_GAM(ALLOCATOR, SZ)
IMPLICIT NONE
TYPE(BUFFERED_ALLOCATOR), INTENT(INOUT) :: ALLOCATOR
INTEGER(KIND=C_SIZE_T), INTENT(IN) :: SZ

TYPE(ALLOCATION_RESERVATION_HANDLE) :: RESERVE_GAM

ALLOCATOR%GAM_BUFR_SZ(ALLOCATOR%GAM_NEXT_BUF) = MAX(ALLOCATOR%GAM_BUFR_SZ(ALLOCATOR%GAM_NEXT_BUF),SZ)
RESERVE_GAM%GAM_BUF = ALLOCATOR%GAM_NEXT_BUF
RESERVE_GAM%GAM_SZ = SZ

ALLOCATOR%GAM_NEXT_BUF = MOD(ALLOCATOR%GAM_NEXT_BUF+1,NBUF)
END FUNCTION RESERVE_GAM

SUBROUTINE INSTANTIATE_ALLOCATOR(ALLOCATOR, GROWING_ALLOCATION)
USE GROWING_ALLOCATOR_MOD, ONLY: REALLOCATE_GROWING_ALLOCATION
USE GROWING_ALLOCATOR_MOD, ONLY: REALLOCATE_GROWING_ALLOCATION, REALLOCATE_GROWING_GAM_ALLOCATION
IMPLICIT NONE
TYPE(BUFFERED_ALLOCATOR), INTENT(INOUT) :: ALLOCATOR
!!TYPE(GROWING_ALLOCATION_TYPE), INTENT(IN), POINTER :: GROWING_ALLOCATION
Expand All @@ -97,6 +123,13 @@ SUBROUTINE INSTANTIATE_ALLOCATOR(ALLOCATOR, GROWING_ALLOCATION)
ALLOCATOR%PTR => GROWING_ALLOCATION

CALL REALLOCATE_GROWING_ALLOCATION(GROWING_ALLOCATION, SUM(ALLOCATOR%BUFR_SZ))

DO I = 0, NBUF-1
ALLOCATOR%GAM_BUFR_SZ(I) = ALIGN(ALLOCATOR%GAM_BUFR_SZ(I),128)
ENDDO
ALLOCATOR%GAM_PTR => GROWING_ALLOCATION

CALL REALLOCATE_GROWING_GAM_ALLOCATION(GROWING_ALLOCATION, SUM(ALLOCATOR%GAM_BUFR_SZ))
END SUBROUTINE

FUNCTION GET_ALLOCATION(ALLOCATOR, RESERVATION)
Expand All @@ -117,6 +150,24 @@ FUNCTION GET_ALLOCATION(ALLOCATOR, RESERVATION)
ENDIF
END FUNCTION GET_ALLOCATION

FUNCTION GET_ALLOCATION_GAM(ALLOCATOR, RESERVATION)
IMPLICIT NONE
TYPE(BUFFERED_ALLOCATOR), INTENT(IN) :: ALLOCATOR
TYPE(ALLOCATION_RESERVATION_HANDLE), INTENT(IN) :: RESERVATION

INTEGER(KIND=C_INT8_T), POINTER :: GET_ALLOCATION_GAM(:)

IF (RESERVATION%GAM_SZ > ALLOCATOR%GAM_BUFR_SZ(RESERVATION%GAM_BUF)) THEN
CALL ABORT_TRANS( "Logical Error in GET_ALLOCATION_GAM")
ENDIF
IF (RESERVATION%GAM_BUF == 0) THEN
GET_ALLOCATION_GAM(1:) => ALLOCATOR%GAM_PTR%GAM_PTR(1:RESERVATION%GAM_SZ)
ELSE
GET_ALLOCATION_GAM(1:) => ALLOCATOR%GAM_PTR%GAM_PTR(SUM(ALLOCATOR%GAM_BUFR_SZ(0:RESERVATION%GAM_BUF-1))+1: &
SUM(ALLOCATOR%GAM_BUFR_SZ(0:RESERVATION%GAM_BUF-1))+RESERVATION%GAM_SZ)
ENDIF
END FUNCTION GET_ALLOCATION_GAM

SUBROUTINE ASSIGN_PTR_FLOAT(DST, SRC, START_IN_BYTES, LENGTH_IN_BYTES, SET_VALUE, SET_STREAM)
USE ISO_C_BINDING, ONLY: C_FLOAT, C_F_POINTER, C_SIZEOF
IMPLICIT NONE
Expand Down
52 changes: 49 additions & 3 deletions src/trans/gpu/algor/growing_allocator_mod.F90
Original file line number Diff line number Diff line change
@@ -1,15 +1,16 @@
MODULE GROWING_ALLOCATOR_MOD

USE ISO_C_BINDING, ONLY: C_INT8_T
USE HIP_ALLOCATOR_MOD
USE ISO_C_BINDING, ONLY: C_INT8_T, C_PTR

PRIVATE
PUBLIC :: GROWING_ALLOCATION_TYPE
PUBLIC :: REALLOCATE_GROWING_ALLOCATION, REGISTER_FREE_FUNCTION
PUBLIC :: REALLOCATE_GROWING_ALLOCATION, REGISTER_FREE_FUNCTION, REALLOCATE_GROWING_GAM_ALLOCATION
PUBLIC :: DESTROY_GROWING_ALLOCATOR

ABSTRACT INTERFACE
SUBROUTINE FREE_FUNC_PROC(PTR, SZ) BIND(C)
USE ISO_C_BINDING, ONLY: C_SIZE_T, C_INT8_T
USE ISO_C_BINDING, ONLY: C_SIZE_T, C_INT8_T, C_PTR
IMPLICIT NONE
INTEGER(KIND=C_INT8_T), TARGET :: PTR(:)
INTEGER(C_SIZE_T), VALUE :: SZ
Expand All @@ -21,9 +22,13 @@ SUBROUTINE FREE_FUNC_PROC(PTR, SZ) BIND(C)
END TYPE

TYPE GROWING_ALLOCATION_TYPE
! Regular allocations
INTEGER(KIND=C_INT8_T), POINTER :: PTR(:)
TYPE(FREE_FUNC_TYPE) :: FREE_FUNCS(10)
INTEGER :: FREE_FUNCS_SZ
! GPU aware MPI weirdness
INTEGER(KIND=C_INT8_T), POINTER :: GAM_PTR(:)
INTEGER(KIND=C_INT8_T), POINTER :: GAM_DEV_PTR(:)
END TYPE

CONTAINS
Expand All @@ -48,6 +53,47 @@ SUBROUTINE REALLOCATE_GROWING_ALLOCATION(ALLOC, SZ)
ENDIF
END SUBROUTINE

SUBROUTINE REALLOCATE_GROWING_GAM_ALLOCATION(ALLOC, SZ)
USE ISO_C_BINDING
USE OPENACC
USE TPM_GEN, ONLY: NOUT
USE HIP_ALLOCATOR_MOD, ONLY: DEVICE_ALLOCATE, DEVICE_FREE
IMPLICIT NONE
TYPE(GROWING_ALLOCATION_TYPE), INTENT(INOUT) :: ALLOC
INTEGER(C_SIZE_T) :: SZ
INTEGER :: I

! Deallocate existing pointer
IF (ASSOCIATED(ALLOC%GAM_PTR) .AND. SZ > SIZE(ALLOC%GAM_PTR, 1, C_SIZE_T)) THEN
WRITE(NOUT,*) "WARNING: REALLOCATING GROWING POINTER CAUSING GRAPH REINSTANTIATION"
DO I = 1, ALLOC%FREE_FUNCS_SZ
CALL ALLOC%FREE_FUNCS(I)%FUNC(ALLOC%GAM_PTR, &
SIZE(ALLOC%GAM_PTR, 1, C_SIZE_T))
ENDDO
#ifdef __HIP_PLATFORM_AMD__
CALL DEVICE_FREE(ALLOC%GAM_DEV_PTR)
#else
!$ACC EXIT DATA DELETE(ALLOC%GAM_PTR)
DEALLOCATE(ALLOC%GAM_PTR)
#endif
NULLIFY(ALLOC%GAM_PTR)
ENDIF

IF (.NOT. ASSOCIATED(ALLOC%GAM_PTR)) THEN
#ifdef __HIP_PLATFORM_AMD__
! This should be moved to an ACC_MALLOC or something similar but it doesn't seem to work.
CALL DEVICE_ALLOCATE(ALLOC%GAM_DEV_PTR,SZ)
!ALLOC%GAM_DEV_PTR = ACC_MALLOC(SZ)
CALL ACC_MAP_DATA(ALLOC%GAM_DEV_PTR, C_LOC(ALLOC%GAM_DEV_PTR),SZ)
CALL C_F_POINTER(C_LOC(ALLOC%GAM_DEV_PTR), ALLOC%GAM_PTR, [SZ])
#else
ALLOCATE(ALLOC%GAM_PTR(SZ))
!$ACC ENTER DATA CREATE(ALLOC%GAM_PTR)
#endif
ALLOC%FREE_FUNCS_SZ = 0
ENDIF
END SUBROUTINE

SUBROUTINE REGISTER_FREE_FUNCTION(ALLOC, FREE_FUNC)
USE ABORT_TRANS_MOD, ONLY: ABORT_TRANS
IMPLICIT NONE
Expand Down
48 changes: 48 additions & 0 deletions src/trans/gpu/algor/hip_allocator_mod.F90
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
MODULE HIP_ALLOCATOR_MOD
USE ISO_C_BINDING

IMPLICIT NONE
SAVE
PRIVATE

PUBLIC :: DEVICE_ALLOCATE, DEVICE_FREE

INTERFACE
SUBROUTINE HIPMALLOC(CPTR, PSIZE) BIND(C, NAME="hipMalloc")
USE ISO_C_BINDING, ONLY : C_PTR, C_SIZE_T
IMPLICIT NONE
TYPE(C_PTR) :: CPTR
INTEGER(C_SIZE_T), VALUE :: PSIZE
END SUBROUTINE HIPMALLOC

SUBROUTINE HIPFREE(PTR) BIND(C, NAME="hipFree")
USE ISO_C_BINDING, ONLY : C_PTR
IMPLICIT NONE
TYPE(C_PTR) :: PTR
END SUBROUTINE HIPFREE
END INTERFACE

CONTAINS

SUBROUTINE DEVICE_ALLOCATE(X, PSIZE)
USE ISO_C_BINDING, ONLY : C_PTR, C_SIZE_T, C_INT8_T
IMPLICIT NONE
INTEGER(C_INT8_T), DIMENSION(:), POINTER, INTENT(INOUT) :: X
INTEGER(C_SIZE_T), VALUE :: PSIZE
TYPE(C_PTR) :: PTR
PTR = C_LOC(X)
CALL HIPMALLOC(PTR, PSIZE)
CALL C_F_POINTER(PTR, X, [PSIZE])

END SUBROUTINE DEVICE_ALLOCATE

SUBROUTINE DEVICE_FREE(X)
USE ISO_C_BINDING, ONLY : C_PTR, C_INT8_T
IMPLICIT NONE
INTEGER(C_INT8_T), DIMENSION(:), POINTER, INTENT(INOUT) :: X
TYPE(C_PTR) :: PTR
PTR = C_LOC(X)
CALL HIPFREE(PTR)
END SUBROUTINE DEVICE_FREE

END MODULE HIP_ALLOCATOR_MOD
38 changes: 23 additions & 15 deletions src/trans/gpu/internal/trgtol_mod.F90
Original file line number Diff line number Diff line change
Expand Up @@ -12,19 +12,24 @@

MODULE TRGTOL_MOD
USE BUFFERED_ALLOCATOR_MOD, ONLY: ALLOCATION_RESERVATION_HANDLE
USE ISO_C_BINDING, ONLY: C_SIZE_T
IMPLICIT NONE

PRIVATE
PUBLIC :: TRGTOL_HANDLE, TRGTOL, PREPARE_TRGTOL

TYPE TRGTOL_HANDLE
TYPE(ALLOCATION_RESERVATION_HANDLE) :: HCOMBUFS, HCOMBUFR_AND_REEL
TYPE(ALLOCATION_RESERVATION_HANDLE) :: HCOMBUFS_COMBUFR
TYPE(ALLOCATION_RESERVATION_HANDLE) :: HREEL
INTEGER(KIND=C_SIZE_T) :: COMBUFS_START
INTEGER(KIND=C_SIZE_T) :: COMBUFR_START
INTEGER(KIND=C_SIZE_T) :: REEL_START
END TYPE
CONTAINS
FUNCTION PREPARE_TRGTOL(ALLOCATOR,KF_GP,KF_FS) RESULT(HTRGTOL)
USE PARKIND_ECTRANS, ONLY: JPIM, JPRB, JPRBT
USE TPM_DISTR, ONLY: D
USE BUFFERED_ALLOCATOR_MOD, ONLY: BUFFERED_ALLOCATOR, RESERVE
USE BUFFERED_ALLOCATOR_MOD, ONLY: BUFFERED_ALLOCATOR, RESERVE, RESERVE_GAM
USE ISO_C_BINDING, ONLY: C_SIZE_T, C_SIZEOF

IMPLICIT NONE
Expand All @@ -37,11 +42,16 @@ FUNCTION PREPARE_TRGTOL(ALLOCATOR,KF_GP,KF_FS) RESULT(HTRGTOL)

INTEGER(KIND=C_SIZE_T) :: NELEM

HTRGTOL%HCOMBUFS = RESERVE(ALLOCATOR, INT(KF_GP*D%NGPTOT,KIND=C_SIZE_T)*C_SIZEOF(DUMMY))
HTRGTOL%COMBUFS_START = 1
NELEM = ALIGN(INT(KF_GP*D%NGPTOT,KIND=C_SIZE_T)*C_SIZEOF(DUMMY), 128)

NELEM = INT(KF_FS*D%NLENGTF,KIND=C_SIZE_T)*C_SIZEOF(DUMMY) ! ZCOMBUFR
NELEM = NELEM + INT(KF_FS*D%NLENGTF,KIND=C_SIZE_T)*C_SIZEOF(DUMMY) ! PREEL_REAL
HTRGTOL%HCOMBUFR_AND_REEL = RESERVE(ALLOCATOR, NELEM)
HTRGTOL%COMBUFR_START = NELEM + 1
NELEM = NELEM + ALIGN(INT(KF_FS*D%NLENGTF,KIND=C_SIZE_T)*C_SIZEOF(DUMMY),128)
HTRGTOL%HCOMBUFS_COMBUFR = RESERVE_GAM(ALLOCATOR, NELEM)

HTRGTOL%REEL_START = 1
NELEM = ALIGN(INT(KF_FS*D%NLENGTF,KIND=C_SIZE_T)*C_SIZEOF(DUMMY),128) ! PREEL_REAL
HTRGTOL%HREEL = RESERVE(ALLOCATOR, NELEM)
END FUNCTION PREPARE_TRGTOL

SUBROUTINE TRGTOL(ALLOCATOR,HTRGTOL,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G,&
Expand Down Expand Up @@ -120,7 +130,7 @@ SUBROUTINE TRGTOL(ALLOCATOR,HTRGTOL,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G,
USE TPM_STATS, ONLY: GSTATS => GSTATS_NVTX
USE TPM_TRANS, ONLY: NPROMA
USE ISO_C_BINDING, ONLY: C_SIZE_T, C_FLOAT, C_DOUBLE, C_INT8_T, C_SIZEOF
USE BUFFERED_ALLOCATOR_MOD, ONLY: BUFFERED_ALLOCATOR, ASSIGN_PTR, GET_ALLOCATION
USE BUFFERED_ALLOCATOR_MOD, ONLY: BUFFERED_ALLOCATOR, ASSIGN_PTR, GET_ALLOCATION, GET_ALLOCATION_GAM
USE OPENACC_EXT, ONLY: EXT_ACC_ARR_DESC, EXT_ACC_PASS, EXT_ACC_CREATE, &
& EXT_ACC_DELETE
USE OPENACC, ONLY: ACC_HANDLE_KIND
Expand Down Expand Up @@ -326,10 +336,8 @@ SUBROUTINE TRGTOL(ALLOCATOR,HTRGTOL,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G,
ENDDO

block
CALL ASSIGN_PTR(PREEL_REAL, GET_ALLOCATION(ALLOCATOR, HTRGTOL%HCOMBUFR_AND_REEL),&
& INT(KF_FS*D%NLENGTF,KIND=C_SIZE_T)*C_SIZEOF(PREEL_REAL(1))+1_C_SIZE_T, &
& INT(KF_FS*D%NLENGTF,KIND=C_SIZE_T)*C_SIZEOF(PREEL_REAL(1)))
!!CALL ASSIGN_PTR(PREEL_REAL, GET_ALLOCATION(ALLOCATOR, HTRGTOL%HCOMBUFR_AND_REEL), size1, size2)
CALL ASSIGN_PTR(PREEL_REAL, GET_ALLOCATION(ALLOCATOR, HTRGTOL%HREEL),&
& HTRGTOL%REEL_START, INT(KF_FS*D%NLENGTF,KIND=C_SIZE_T)*C_SIZEOF(PREEL_REAL(1)))
end block

#ifdef OMPGPU
Expand Down Expand Up @@ -458,8 +466,8 @@ SUBROUTINE TRGTOL(ALLOCATOR,HTRGTOL,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G,
ENDDO

IF (ISEND_COUNTS > 0) THEN
CALL ASSIGN_PTR(ZCOMBUFS, GET_ALLOCATION(ALLOCATOR, HTRGTOL%HCOMBUFS),&
& 1_C_SIZE_T, INT(ICOMBUFS_OFFSET(ISEND_COUNTS+1),KIND=C_SIZE_T)*C_SIZEOF(ZCOMBUFS(1)))
CALL ASSIGN_PTR(ZCOMBUFS, GET_ALLOCATION_GAM(ALLOCATOR, HTRGTOL%HCOMBUFS_COMBUFR),&
& HTRGTOL%COMBUFS_START, INT(ICOMBUFS_OFFSET(ISEND_COUNTS+1),KIND=C_SIZE_T)*C_SIZEOF(ZCOMBUFS(1)))
ENDIF

!....Pack loop.........................................................
Expand Down Expand Up @@ -567,8 +575,8 @@ SUBROUTINE TRGTOL(ALLOCATOR,HTRGTOL,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G,

CALL GSTATS(411,0)
IF (IRECV_COUNTS > 0) THEN
CALL ASSIGN_PTR(ZCOMBUFR, GET_ALLOCATION(ALLOCATOR, HTRGTOL%HCOMBUFR_AND_REEL),&
& 1_C_SIZE_T, INT(ICOMBUFR_OFFSET(IRECV_COUNTS+1),KIND=C_SIZE_T)*C_SIZEOF(ZCOMBUFR(1)))
CALL ASSIGN_PTR(ZCOMBUFR, GET_ALLOCATION_GAM(ALLOCATOR, HTRGTOL%HCOMBUFS_COMBUFR),&
& HTRGTOL%COMBUFR_START, INT(ICOMBUFR_OFFSET(IRECV_COUNTS+1),KIND=C_SIZE_T)*C_SIZEOF(ZCOMBUFR(1)))
ENDIF
#ifdef OMPGPU
#endif
Expand Down
Loading

0 comments on commit 79931dc

Please sign in to comment.