Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

WIP : MI300A Support #170

Open
wants to merge 1 commit into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions src/trans/gpu/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
list( APPEND trans_gpu_common_src
algor/ext_acc.F90
algor/c_hipmemgetinfo.cpp
algor/hip_allocator_mod.F90
algor/buffered_allocator_mod.F90
algor/device_mod.F90
algor/growing_allocator_mod.F90
Expand Down
56 changes: 54 additions & 2 deletions src/trans/gpu/algor/buffered_allocator_mod.F90
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ MODULE BUFFERED_ALLOCATOR_MOD
IMPLICIT NONE

PRIVATE
PUBLIC :: BUFFERED_ALLOCATOR, ALLOCATION_RESERVATION_HANDLE, RESERVE, ASSIGN_PTR, GET_ALLOCATION
PUBLIC :: BUFFERED_ALLOCATOR, ALLOCATION_RESERVATION_HANDLE, RESERVE, RESERVE_GAM, ASSIGN_PTR, GET_ALLOCATION, GET_ALLOCATION_GAM
PUBLIC :: MAKE_BUFFERED_ALLOCATOR, INSTANTIATE_ALLOCATOR

! The buffered allocator uses double buffering. The idea is that the allocator
Expand All @@ -44,10 +44,19 @@ MODULE BUFFERED_ALLOCATOR_MOD
INTEGER(KIND=C_SIZE_T) :: BUFR_SZ(0:NBUF-1)
INTEGER(KIND=JPIM) :: NEXT_BUF
TYPE(GROWING_ALLOCATION_TYPE), POINTER :: PTR

! GPU Aware MPI versions
INTEGER(KIND=C_SIZE_T) :: GAM_BUFR_SZ(0:NBUF-1)
INTEGER(KIND=JPIM) :: GAM_NEXT_BUF
TYPE(GROWING_ALLOCATION_TYPE), POINTER :: GAM_PTR
END TYPE
TYPE ALLOCATION_RESERVATION_HANDLE
INTEGER(KIND=C_SIZE_T) :: SZ
INTEGER(KIND=JPIM) :: BUF

! GPU Aware MPI versions
INTEGER(KIND=C_SIZE_T) :: GAM_SZ
INTEGER(KIND=JPIM) :: GAM_BUF
END TYPE

INTERFACE ASSIGN_PTR
Expand All @@ -67,6 +76,9 @@ FUNCTION MAKE_BUFFERED_ALLOCATOR()

MAKE_BUFFERED_ALLOCATOR%BUFR_SZ(:) = 0
MAKE_BUFFERED_ALLOCATOR%NEXT_BUF = 0

MAKE_BUFFERED_ALLOCATOR%GAM_BUFR_SZ(:) = 0
MAKE_BUFFERED_ALLOCATOR%GAM_NEXT_BUF = 0
END FUNCTION MAKE_BUFFERED_ALLOCATOR

FUNCTION RESERVE(ALLOCATOR, SZ, WHO)
Expand All @@ -84,8 +96,23 @@ FUNCTION RESERVE(ALLOCATOR, SZ, WHO)
ALLOCATOR%NEXT_BUF = MOD(ALLOCATOR%NEXT_BUF+1,NBUF)
END FUNCTION RESERVE

FUNCTION RESERVE_GAM(ALLOCATOR, SZ, WHO)
IMPLICIT NONE
TYPE(BUFFERED_ALLOCATOR), INTENT(INOUT) :: ALLOCATOR
INTEGER(KIND=C_SIZE_T), INTENT(IN) :: SZ
CHARACTER(*), INTENT(IN), OPTIONAL :: WHO

TYPE(ALLOCATION_RESERVATION_HANDLE) :: RESERVE_GAM

ALLOCATOR%GAM_BUFR_SZ(ALLOCATOR%GAM_NEXT_BUF) = MAX(ALLOCATOR%GAM_BUFR_SZ(ALLOCATOR%GAM_NEXT_BUF),SZ)
RESERVE_GAM%GAM_BUF = ALLOCATOR%GAM_NEXT_BUF
RESERVE_GAM%GAM_SZ = SZ

ALLOCATOR%GAM_NEXT_BUF = MOD(ALLOCATOR%GAM_NEXT_BUF+1,NBUF)
END FUNCTION RESERVE_GAM

SUBROUTINE INSTANTIATE_ALLOCATOR(ALLOCATOR, GROWING_ALLOCATION)
USE GROWING_ALLOCATOR_MOD, ONLY: REALLOCATE_GROWING_ALLOCATION
USE GROWING_ALLOCATOR_MOD, ONLY: REALLOCATE_GROWING_ALLOCATION, REALLOCATE_GROWING_GAM_ALLOCATION
IMPLICIT NONE
TYPE(BUFFERED_ALLOCATOR), INTENT(INOUT) :: ALLOCATOR
!!TYPE(GROWING_ALLOCATION_TYPE), INTENT(IN), POINTER :: GROWING_ALLOCATION
Expand All @@ -98,6 +125,13 @@ SUBROUTINE INSTANTIATE_ALLOCATOR(ALLOCATOR, GROWING_ALLOCATION)
ALLOCATOR%PTR => GROWING_ALLOCATION

CALL REALLOCATE_GROWING_ALLOCATION(GROWING_ALLOCATION, SUM(ALLOCATOR%BUFR_SZ))

DO I = 0, NBUF-1
ALLOCATOR%GAM_BUFR_SZ(I) = ALIGN(ALLOCATOR%GAM_BUFR_SZ(I),128)
ENDDO
ALLOCATOR%GAM_PTR => GROWING_ALLOCATION

CALL REALLOCATE_GROWING_GAM_ALLOCATION(GROWING_ALLOCATION, SUM(ALLOCATOR%GAM_BUFR_SZ))
END SUBROUTINE

FUNCTION GET_ALLOCATION(ALLOCATOR, RESERVATION)
Expand All @@ -118,6 +152,24 @@ FUNCTION GET_ALLOCATION(ALLOCATOR, RESERVATION)
ENDIF
END FUNCTION GET_ALLOCATION

FUNCTION GET_ALLOCATION_GAM(ALLOCATOR, RESERVATION)
IMPLICIT NONE
TYPE(BUFFERED_ALLOCATOR), INTENT(IN) :: ALLOCATOR
TYPE(ALLOCATION_RESERVATION_HANDLE), INTENT(IN) :: RESERVATION

INTEGER(KIND=C_INT8_T), POINTER :: GET_ALLOCATION_GAM(:)

IF (RESERVATION%GAM_SZ > ALLOCATOR%GAM_BUFR_SZ(RESERVATION%GAM_BUF)) THEN
CALL ABORT_TRANS( "Logical Error in GET_ALLOCATION_GAM")
ENDIF
IF (RESERVATION%GAM_BUF == 0) THEN
GET_ALLOCATION_GAM(1:) => ALLOCATOR%GAM_PTR%GAM_PTR(1:RESERVATION%GAM_SZ)
ELSE
GET_ALLOCATION_GAM(1:) => ALLOCATOR%GAM_PTR%GAM_PTR(SUM(ALLOCATOR%GAM_BUFR_SZ(0:RESERVATION%GAM_BUF-1))+1: &
SUM(ALLOCATOR%GAM_BUFR_SZ(0:RESERVATION%GAM_BUF-1))+RESERVATION%GAM_SZ)
ENDIF
END FUNCTION GET_ALLOCATION_GAM

SUBROUTINE ASSIGN_PTR_FLOAT(DST, SRC, START_IN_BYTES, LENGTH_IN_BYTES, SET_VALUE, SET_STREAM)
USE ISO_C_BINDING, ONLY: C_FLOAT, C_F_POINTER, C_SIZEOF
IMPLICIT NONE
Expand Down
52 changes: 49 additions & 3 deletions src/trans/gpu/algor/growing_allocator_mod.F90
Original file line number Diff line number Diff line change
@@ -1,15 +1,16 @@
MODULE GROWING_ALLOCATOR_MOD

USE ISO_C_BINDING, ONLY: C_INT8_T
USE HIP_ALLOCATOR_MOD
USE ISO_C_BINDING, ONLY: C_INT8_T, C_PTR

PRIVATE
PUBLIC :: GROWING_ALLOCATION_TYPE
PUBLIC :: REALLOCATE_GROWING_ALLOCATION, REGISTER_FREE_FUNCTION
PUBLIC :: REALLOCATE_GROWING_ALLOCATION, REGISTER_FREE_FUNCTION, REALLOCATE_GROWING_GAM_ALLOCATION
PUBLIC :: DESTROY_GROWING_ALLOCATOR

ABSTRACT INTERFACE
SUBROUTINE FREE_FUNC_PROC(PTR, SZ) BIND(C)
USE ISO_C_BINDING, ONLY: C_SIZE_T, C_INT8_T
USE ISO_C_BINDING, ONLY: C_SIZE_T, C_INT8_T, C_PTR
IMPLICIT NONE
INTEGER(KIND=C_INT8_T), TARGET :: PTR(:)
INTEGER(C_SIZE_T), VALUE :: SZ
Expand All @@ -21,9 +22,13 @@ SUBROUTINE FREE_FUNC_PROC(PTR, SZ) BIND(C)
END TYPE

TYPE GROWING_ALLOCATION_TYPE
! Regular allocations
INTEGER(KIND=C_INT8_T), POINTER :: PTR(:)
TYPE(FREE_FUNC_TYPE) :: FREE_FUNCS(10)
INTEGER :: FREE_FUNCS_SZ
! GPU aware MPI weirdness
INTEGER(KIND=C_INT8_T), POINTER :: GAM_PTR(:)
INTEGER(KIND=C_INT8_T), POINTER :: GAM_DEV_PTR(:)
END TYPE

CONTAINS
Expand All @@ -48,6 +53,47 @@ SUBROUTINE REALLOCATE_GROWING_ALLOCATION(ALLOC, SZ)
ENDIF
END SUBROUTINE

SUBROUTINE REALLOCATE_GROWING_GAM_ALLOCATION(ALLOC, SZ)
USE ISO_C_BINDING
USE OPENACC
USE TPM_GEN, ONLY: NOUT
USE HIP_ALLOCATOR_MOD, ONLY: DEVICE_ALLOCATE, DEVICE_FREE
IMPLICIT NONE
TYPE(GROWING_ALLOCATION_TYPE), INTENT(INOUT) :: ALLOC
INTEGER(C_SIZE_T) :: SZ
INTEGER :: I

! Deallocate existing pointer
IF (ASSOCIATED(ALLOC%GAM_PTR) .AND. SZ > SIZE(ALLOC%GAM_PTR, 1, C_SIZE_T)) THEN
WRITE(NOUT,*) "WARNING: REALLOCATING GROWING POINTER CAUSING GRAPH REINSTANTIATION"
DO I = 1, ALLOC%FREE_FUNCS_SZ
CALL ALLOC%FREE_FUNCS(I)%FUNC(ALLOC%GAM_PTR, &
SIZE(ALLOC%GAM_PTR, 1, C_SIZE_T))
ENDDO
#ifdef __HIP_PLATFORM_AMD__
CALL DEVICE_FREE(ALLOC%GAM_DEV_PTR)
#else
!$ACC EXIT DATA DELETE(ALLOC%GAM_PTR)
DEALLOCATE(ALLOC%GAM_PTR)
#endif
NULLIFY(ALLOC%GAM_PTR)
ENDIF

IF (.NOT. ASSOCIATED(ALLOC%GAM_PTR)) THEN
#ifdef __HIP_PLATFORM_AMD__
! This should be moved to an ACC_MALLOC or something similar but it doesn't seem to work.
CALL DEVICE_ALLOCATE(ALLOC%GAM_DEV_PTR,SZ)
!ALLOC%GAM_DEV_PTR = ACC_MALLOC(SZ)
CALL ACC_MAP_DATA(ALLOC%GAM_DEV_PTR, C_LOC(ALLOC%GAM_DEV_PTR),SZ)
CALL C_F_POINTER(C_LOC(ALLOC%GAM_DEV_PTR), ALLOC%GAM_PTR, [SZ])
#else
ALLOCATE(ALLOC%GAM_PTR(SZ))
!$ACC ENTER DATA CREATE(ALLOC%GAM_PTR)
#endif
ALLOC%FREE_FUNCS_SZ = 0
ENDIF
END SUBROUTINE

SUBROUTINE REGISTER_FREE_FUNCTION(ALLOC, FREE_FUNC)
USE ABORT_TRANS_MOD, ONLY: ABORT_TRANS
IMPLICIT NONE
Expand Down
48 changes: 48 additions & 0 deletions src/trans/gpu/algor/hip_allocator_mod.F90
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
MODULE HIP_ALLOCATOR_MOD
USE ISO_C_BINDING

IMPLICIT NONE
SAVE
PRIVATE

PUBLIC :: DEVICE_ALLOCATE, DEVICE_FREE

INTERFACE
SUBROUTINE HIPMALLOC(CPTR, PSIZE) BIND(C, NAME="hipMalloc")
USE ISO_C_BINDING, ONLY : C_PTR, C_SIZE_T
IMPLICIT NONE
TYPE(C_PTR) :: CPTR
INTEGER(C_SIZE_T), VALUE :: PSIZE
END SUBROUTINE HIPMALLOC

SUBROUTINE HIPFREE(PTR) BIND(C, NAME="hipFree")
USE ISO_C_BINDING, ONLY : C_PTR
IMPLICIT NONE
TYPE(C_PTR) :: PTR
END SUBROUTINE HIPFREE
END INTERFACE

CONTAINS

SUBROUTINE DEVICE_ALLOCATE(X, PSIZE)
USE ISO_C_BINDING, ONLY : C_PTR, C_SIZE_T, C_INT8_T
IMPLICIT NONE
INTEGER(C_INT8_T), DIMENSION(:), POINTER, INTENT(INOUT) :: X
INTEGER(C_SIZE_T), VALUE :: PSIZE
TYPE(C_PTR) :: PTR
PTR = C_LOC(X)
CALL HIPMALLOC(PTR, PSIZE)
CALL C_F_POINTER(PTR, X, [PSIZE])

END SUBROUTINE DEVICE_ALLOCATE

SUBROUTINE DEVICE_FREE(X)
USE ISO_C_BINDING, ONLY : C_PTR, C_INT8_T
IMPLICIT NONE
INTEGER(C_INT8_T), DIMENSION(:), POINTER, INTENT(INOUT) :: X
TYPE(C_PTR) :: PTR
PTR = C_LOC(X)
CALL HIPFREE(PTR)
END SUBROUTINE DEVICE_FREE

END MODULE HIP_ALLOCATOR_MOD
38 changes: 23 additions & 15 deletions src/trans/gpu/internal/trgtol_mod.F90
Original file line number Diff line number Diff line change
Expand Up @@ -12,19 +12,24 @@

MODULE TRGTOL_MOD
USE BUFFERED_ALLOCATOR_MOD, ONLY: ALLOCATION_RESERVATION_HANDLE
USE ISO_C_BINDING, ONLY: C_SIZE_T
IMPLICIT NONE

PRIVATE
PUBLIC :: TRGTOL_HANDLE, TRGTOL, PREPARE_TRGTOL

TYPE TRGTOL_HANDLE
TYPE(ALLOCATION_RESERVATION_HANDLE) :: HCOMBUFS, HCOMBUFR_AND_REEL
TYPE(ALLOCATION_RESERVATION_HANDLE) :: HCOMBUFS_COMBUFR
TYPE(ALLOCATION_RESERVATION_HANDLE) :: HREEL
INTEGER(KIND=C_SIZE_T) :: COMBUFS_START
INTEGER(KIND=C_SIZE_T) :: COMBUFR_START
INTEGER(KIND=C_SIZE_T) :: REEL_START
END TYPE
CONTAINS
FUNCTION PREPARE_TRGTOL(ALLOCATOR,KF_GP,KF_FS) RESULT(HTRGTOL)
USE PARKIND_ECTRANS, ONLY: JPIM, JPRB, JPRBT, JPIB
USE TPM_DISTR, ONLY: D
USE BUFFERED_ALLOCATOR_MOD, ONLY: BUFFERED_ALLOCATOR, RESERVE
USE BUFFERED_ALLOCATOR_MOD, ONLY: BUFFERED_ALLOCATOR, RESERVE, RESERVE_GAM
USE ISO_C_BINDING, ONLY: C_SIZE_T, C_SIZEOF

IMPLICIT NONE
Expand All @@ -37,12 +42,16 @@ FUNCTION PREPARE_TRGTOL(ALLOCATOR,KF_GP,KF_FS) RESULT(HTRGTOL)

INTEGER(KIND=JPIB) :: NELEM

HTRGTOL%HCOMBUFS = RESERVE(ALLOCATOR, 1_JPIB*KF_GP*D%NGPTOT*C_SIZEOF(DUMMY), "HTRGTOL%HCOMBUFS")
HTRGTOL%COMBUFS_START = 1
NELEM = ALIGN(1_JPIB*KF_GP*D%NGPTOT*C_SIZEOF(DUMMY), 128)

NELEM = 0
NELEM = NELEM + 1_JPIB*KF_FS*D%NLENGTF*C_SIZEOF(DUMMY) ! ZCOMBUFR
NELEM = NELEM + 1_JPIB*KF_FS*D%NLENGTF*C_SIZEOF(DUMMY) ! PREEL_REAL
HTRGTOL%HCOMBUFR_AND_REEL = RESERVE(ALLOCATOR, NELEM, "HTRGTOL%HCOMBUFR_AND_REEL")
HTRGTOL%COMBUFR_START = NELEM + 1
NELEM = NELEM + ALIGN(1_JPIB*KF_FS*D%NLENGTF*C_SIZEOF(DUMMY), 128)
HTRGTOL%HCOMBUFS_COMBUFR = RESERVE_GAM(ALLOCATOR, NELEM, "HTRGTOL%HCOMBUFR_COMBUFS")

HTRGTOL%REEL_START = 1
NELEM = ALIGN(1_JPIB*KF_FS*D%NLENGTF*C_SIZEOF(DUMMY), 128) ! PREEL_REAL
HTRGTOL%HREEL = RESERVE(ALLOCATOR, NELEM, "HTRGTOL%HREEL")
END FUNCTION PREPARE_TRGTOL

SUBROUTINE TRGTOL(ALLOCATOR,HTRGTOL,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G,&
Expand Down Expand Up @@ -121,7 +130,7 @@ SUBROUTINE TRGTOL(ALLOCATOR,HTRGTOL,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G,
USE TPM_STATS, ONLY: GSTATS => GSTATS_NVTX
USE TPM_TRANS, ONLY: NPROMA
USE ISO_C_BINDING, ONLY: C_SIZE_T, C_FLOAT, C_DOUBLE, C_INT8_T, C_SIZEOF
USE BUFFERED_ALLOCATOR_MOD, ONLY: BUFFERED_ALLOCATOR, ASSIGN_PTR, GET_ALLOCATION
USE BUFFERED_ALLOCATOR_MOD, ONLY: BUFFERED_ALLOCATOR, ASSIGN_PTR, GET_ALLOCATION, GET_ALLOCATION_GAM
USE OPENACC_EXT, ONLY: EXT_ACC_ARR_DESC, EXT_ACC_PASS, EXT_ACC_CREATE, &
& EXT_ACC_DELETE
USE OPENACC, ONLY: ACC_HANDLE_KIND
Expand Down Expand Up @@ -333,9 +342,8 @@ SUBROUTINE TRGTOL(ALLOCATOR,HTRGTOL,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G,
ENDDO

block
CALL ASSIGN_PTR(PREEL_REAL, GET_ALLOCATION(ALLOCATOR, HTRGTOL%HCOMBUFR_AND_REEL),&
& 1_JPIB*KF_FS*D%NLENGTF*C_SIZEOF(PREEL_REAL(1))+1, 1_JPIB*KF_FS*D%NLENGTF*C_SIZEOF(PREEL_REAL(1)))
!!CALL ASSIGN_PTR(PREEL_REAL, GET_ALLOCATION(ALLOCATOR, HTRGTOL%HCOMBUFR_AND_REEL), size1, size2)
CALL ASSIGN_PTR(PREEL_REAL, GET_ALLOCATION(ALLOCATOR, HTRGTOL%HREEL),&
& HTRGTOL%REEL_START, 1_JPIB*KF_FS*D%NLENGTF*C_SIZEOF(PREEL_REAL(1)))
end block

#ifdef OMPGPU
Expand Down Expand Up @@ -464,8 +472,8 @@ SUBROUTINE TRGTOL(ALLOCATOR,HTRGTOL,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G,
ENDDO

IF (ISEND_COUNTS > 0) THEN
CALL ASSIGN_PTR(ZCOMBUFS, GET_ALLOCATION(ALLOCATOR, HTRGTOL%HCOMBUFS),&
& 1_JPIB, ICOMBUFS_OFFSET(ISEND_COUNTS+1)*C_SIZEOF(ZCOMBUFS(1)))
CALL ASSIGN_PTR(ZCOMBUFS, GET_ALLOCATION_GAM(ALLOCATOR, HTRGTOL%HCOMBUFS_COMBUFR),&
& HTRGTOL%COMBUFS_START, ICOMBUFS_OFFSET(ISEND_COUNTS+1)*C_SIZEOF(ZCOMBUFS(1)))
ENDIF

!....Pack loop.........................................................
Expand Down Expand Up @@ -573,8 +581,8 @@ SUBROUTINE TRGTOL(ALLOCATOR,HTRGTOL,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G,

CALL GSTATS(411,0)
IF (IRECV_COUNTS > 0) THEN
CALL ASSIGN_PTR(ZCOMBUFR, GET_ALLOCATION(ALLOCATOR, HTRGTOL%HCOMBUFR_AND_REEL),&
& 1_JPIB, ICOMBUFR_OFFSET(IRECV_COUNTS+1)*C_SIZEOF(ZCOMBUFR(1)))
CALL ASSIGN_PTR(ZCOMBUFR, GET_ALLOCATION_GAM(ALLOCATOR, HTRGTOL%HCOMBUFS_COMBUFR),&
& HTRGTOL%COMBUFR_START, ICOMBUFR_OFFSET(IRECV_COUNTS+1)*C_SIZEOF(ZCOMBUFR(1)))
ENDIF
#ifdef OMPGPU
#endif
Expand Down
Loading