diff --git a/src/trans/gpu/CMakeLists.txt b/src/trans/gpu/CMakeLists.txt index af4e07e5..748ab634 100644 --- a/src/trans/gpu/CMakeLists.txt +++ b/src/trans/gpu/CMakeLists.txt @@ -11,6 +11,7 @@ list( APPEND trans_gpu_common_src algor/ext_acc.F90 algor/c_hipmemgetinfo.cpp + algor/hip_allocator_mod.F90 algor/buffered_allocator_mod.F90 algor/device_mod.F90 algor/growing_allocator_mod.F90 diff --git a/src/trans/gpu/algor/buffered_allocator_mod.F90 b/src/trans/gpu/algor/buffered_allocator_mod.F90 index ba613fe7..da3ca8a4 100644 --- a/src/trans/gpu/algor/buffered_allocator_mod.F90 +++ b/src/trans/gpu/algor/buffered_allocator_mod.F90 @@ -17,7 +17,7 @@ MODULE BUFFERED_ALLOCATOR_MOD IMPLICIT NONE PRIVATE - PUBLIC :: BUFFERED_ALLOCATOR, ALLOCATION_RESERVATION_HANDLE, RESERVE, ASSIGN_PTR, GET_ALLOCATION + PUBLIC :: BUFFERED_ALLOCATOR, ALLOCATION_RESERVATION_HANDLE, RESERVE, RESERVE_GAM, ASSIGN_PTR, GET_ALLOCATION, GET_ALLOCATION_GAM PUBLIC :: MAKE_BUFFERED_ALLOCATOR, INSTANTIATE_ALLOCATOR ! The buffered allocator uses double buffering. The idea is that the allocator @@ -44,10 +44,19 @@ MODULE BUFFERED_ALLOCATOR_MOD INTEGER(KIND=C_SIZE_T) :: BUFR_SZ(0:NBUF-1) INTEGER(KIND=JPIM) :: NEXT_BUF TYPE(GROWING_ALLOCATION_TYPE), POINTER :: PTR + + ! GPU Aware MPI versions + INTEGER(KIND=C_SIZE_T) :: GAM_BUFR_SZ(0:NBUF-1) + INTEGER(KIND=JPIM) :: GAM_NEXT_BUF + TYPE(GROWING_ALLOCATION_TYPE), POINTER :: GAM_PTR END TYPE TYPE ALLOCATION_RESERVATION_HANDLE INTEGER(KIND=C_SIZE_T) :: SZ INTEGER(KIND=JPIM) :: BUF + + ! 
GPU Aware MPI versions + INTEGER(KIND=C_SIZE_T) :: GAM_SZ + INTEGER(KIND=JPIM) :: GAM_BUF END TYPE INTERFACE ASSIGN_PTR @@ -67,6 +76,9 @@ FUNCTION MAKE_BUFFERED_ALLOCATOR() MAKE_BUFFERED_ALLOCATOR%BUFR_SZ(:) = 0 MAKE_BUFFERED_ALLOCATOR%NEXT_BUF = 0 + + MAKE_BUFFERED_ALLOCATOR%GAM_BUFR_SZ(:) = 0 + MAKE_BUFFERED_ALLOCATOR%GAM_NEXT_BUF = 0 END FUNCTION MAKE_BUFFERED_ALLOCATOR FUNCTION RESERVE(ALLOCATOR, SZ, WHO) @@ -84,8 +96,23 @@ FUNCTION RESERVE(ALLOCATOR, SZ, WHO) ALLOCATOR%NEXT_BUF = MOD(ALLOCATOR%NEXT_BUF+1,NBUF) END FUNCTION RESERVE + FUNCTION RESERVE_GAM(ALLOCATOR, SZ, WHO) + IMPLICIT NONE + TYPE(BUFFERED_ALLOCATOR), INTENT(INOUT) :: ALLOCATOR + INTEGER(KIND=C_SIZE_T), INTENT(IN) :: SZ + CHARACTER(*), INTENT(IN), OPTIONAL :: WHO + + TYPE(ALLOCATION_RESERVATION_HANDLE) :: RESERVE_GAM + + ALLOCATOR%GAM_BUFR_SZ(ALLOCATOR%GAM_NEXT_BUF) = MAX(ALLOCATOR%GAM_BUFR_SZ(ALLOCATOR%GAM_NEXT_BUF),SZ) + RESERVE_GAM%GAM_BUF = ALLOCATOR%GAM_NEXT_BUF + RESERVE_GAM%GAM_SZ = SZ + + ALLOCATOR%GAM_NEXT_BUF = MOD(ALLOCATOR%GAM_NEXT_BUF+1,NBUF) + END FUNCTION RESERVE_GAM + SUBROUTINE INSTANTIATE_ALLOCATOR(ALLOCATOR, GROWING_ALLOCATION) - USE GROWING_ALLOCATOR_MOD, ONLY: REALLOCATE_GROWING_ALLOCATION + USE GROWING_ALLOCATOR_MOD, ONLY: REALLOCATE_GROWING_ALLOCATION, REALLOCATE_GROWING_GAM_ALLOCATION IMPLICIT NONE TYPE(BUFFERED_ALLOCATOR), INTENT(INOUT) :: ALLOCATOR !!TYPE(GROWING_ALLOCATION_TYPE), INTENT(IN), POINTER :: GROWING_ALLOCATION @@ -98,6 +125,13 @@ SUBROUTINE INSTANTIATE_ALLOCATOR(ALLOCATOR, GROWING_ALLOCATION) ALLOCATOR%PTR => GROWING_ALLOCATION CALL REALLOCATE_GROWING_ALLOCATION(GROWING_ALLOCATION, SUM(ALLOCATOR%BUFR_SZ)) + + DO I = 0, NBUF-1 + ALLOCATOR%GAM_BUFR_SZ(I) = ALIGN(ALLOCATOR%GAM_BUFR_SZ(I),128) + ENDDO + ALLOCATOR%GAM_PTR => GROWING_ALLOCATION + + CALL REALLOCATE_GROWING_GAM_ALLOCATION(GROWING_ALLOCATION, SUM(ALLOCATOR%GAM_BUFR_SZ)) END SUBROUTINE FUNCTION GET_ALLOCATION(ALLOCATOR, RESERVATION) @@ -118,6 +152,24 @@ FUNCTION GET_ALLOCATION(ALLOCATOR, RESERVATION) 
ENDIF END FUNCTION GET_ALLOCATION + FUNCTION GET_ALLOCATION_GAM(ALLOCATOR, RESERVATION) + IMPLICIT NONE + TYPE(BUFFERED_ALLOCATOR), INTENT(IN) :: ALLOCATOR + TYPE(ALLOCATION_RESERVATION_HANDLE), INTENT(IN) :: RESERVATION + + INTEGER(KIND=C_INT8_T), POINTER :: GET_ALLOCATION_GAM(:) + + IF (RESERVATION%GAM_SZ > ALLOCATOR%GAM_BUFR_SZ(RESERVATION%GAM_BUF)) THEN + CALL ABORT_TRANS( "Logical Error in GET_ALLOCATION_GAM") + ENDIF + IF (RESERVATION%GAM_BUF == 0) THEN + GET_ALLOCATION_GAM(1:) => ALLOCATOR%GAM_PTR%GAM_PTR(1:RESERVATION%GAM_SZ) + ELSE + GET_ALLOCATION_GAM(1:) => ALLOCATOR%GAM_PTR%GAM_PTR(SUM(ALLOCATOR%GAM_BUFR_SZ(0:RESERVATION%GAM_BUF-1))+1: & + SUM(ALLOCATOR%GAM_BUFR_SZ(0:RESERVATION%GAM_BUF-1))+RESERVATION%GAM_SZ) + ENDIF + END FUNCTION GET_ALLOCATION_GAM + SUBROUTINE ASSIGN_PTR_FLOAT(DST, SRC, START_IN_BYTES, LENGTH_IN_BYTES, SET_VALUE, SET_STREAM) USE ISO_C_BINDING, ONLY: C_FLOAT, C_F_POINTER, C_SIZEOF IMPLICIT NONE diff --git a/src/trans/gpu/algor/growing_allocator_mod.F90 b/src/trans/gpu/algor/growing_allocator_mod.F90 index db869e48..b5b60fbb 100644 --- a/src/trans/gpu/algor/growing_allocator_mod.F90 +++ b/src/trans/gpu/algor/growing_allocator_mod.F90 @@ -1,15 +1,16 @@ MODULE GROWING_ALLOCATOR_MOD - USE ISO_C_BINDING, ONLY: C_INT8_T + USE HIP_ALLOCATOR_MOD + USE ISO_C_BINDING, ONLY: C_INT8_T, C_PTR PRIVATE PUBLIC :: GROWING_ALLOCATION_TYPE - PUBLIC :: REALLOCATE_GROWING_ALLOCATION, REGISTER_FREE_FUNCTION + PUBLIC :: REALLOCATE_GROWING_ALLOCATION, REGISTER_FREE_FUNCTION, REALLOCATE_GROWING_GAM_ALLOCATION PUBLIC :: DESTROY_GROWING_ALLOCATOR ABSTRACT INTERFACE SUBROUTINE FREE_FUNC_PROC(PTR, SZ) BIND(C) - USE ISO_C_BINDING, ONLY: C_SIZE_T, C_INT8_T + USE ISO_C_BINDING, ONLY: C_SIZE_T, C_INT8_T, C_PTR IMPLICIT NONE INTEGER(KIND=C_INT8_T), TARGET :: PTR(:) INTEGER(C_SIZE_T), VALUE :: SZ @@ -21,9 +22,13 @@ SUBROUTINE FREE_FUNC_PROC(PTR, SZ) BIND(C) END TYPE TYPE GROWING_ALLOCATION_TYPE + ! 
Regular allocations
     INTEGER(KIND=C_INT8_T), POINTER :: PTR(:)
     TYPE(FREE_FUNC_TYPE) :: FREE_FUNCS(10)
     INTEGER :: FREE_FUNCS_SZ
+    ! GPU-aware MPI buffer (GAM_PTR) and, on HIP, its raw hipMalloc allocation (GAM_DEV_PTR); both start out nullified
+    INTEGER(KIND=C_INT8_T), POINTER :: GAM_PTR(:) => NULL()
+    INTEGER(KIND=C_INT8_T), POINTER :: GAM_DEV_PTR(:) => NULL()
   END TYPE
 
 CONTAINS
@@ -48,6 +53,63 @@ SUBROUTINE REALLOCATE_GROWING_ALLOCATION(ALLOC, SZ)
     ENDIF
   END SUBROUTINE
 
+  SUBROUTINE REALLOCATE_GROWING_GAM_ALLOCATION(ALLOC, SZ)
+    ! Make sure ALLOC%GAM_PTR, the buffer used for GPU-aware MPI, holds at least SZ bytes.
+    ! An existing buffer that is too small is released (after running the registered free
+    ! callbacks) and replaced; a buffer that is already large enough is left untouched.
+    !   ALLOC - allocation record whose GAM_PTR / GAM_DEV_PTR are (re)allocated
+    !   SZ    - required size in bytes
+    USE ISO_C_BINDING
+    USE OPENACC
+    USE TPM_GEN, ONLY: NOUT
+    USE HIP_ALLOCATOR_MOD, ONLY: DEVICE_ALLOCATE, DEVICE_FREE
+    IMPLICIT NONE
+    TYPE(GROWING_ALLOCATION_TYPE), INTENT(INOUT) :: ALLOC
+    INTEGER(C_SIZE_T), INTENT(IN) :: SZ
+    INTEGER :: I
+    LOGICAL :: LGROW
+
+    ! Fortran does not short-circuit .AND., so only query SIZE once the pointer is known to
+    ! be associated.
+    LGROW = .FALSE.
+    IF (ASSOCIATED(ALLOC%GAM_PTR)) LGROW = SZ > SIZE(ALLOC%GAM_PTR, 1, C_SIZE_T)
+
+    ! Deallocate existing pointer
+    IF (LGROW) THEN
+      WRITE(NOUT,*) "WARNING: REALLOCATING GROWING POINTER CAUSING GRAPH REINSTANTIATION"
+      DO I = 1, ALLOC%FREE_FUNCS_SZ
+        CALL ALLOC%FREE_FUNCS(I)%FUNC(ALLOC%GAM_PTR, &
+                                      SIZE(ALLOC%GAM_PTR, 1, C_SIZE_T))
+      ENDDO
+#ifdef __HIP_PLATFORM_AMD__
+      ! Remove the OpenACC mapping before releasing the device memory it refers to;
+      ! DEVICE_FREE also nullifies GAM_DEV_PTR.
+      CALL ACC_UNMAP_DATA(ALLOC%GAM_DEV_PTR)
+      CALL DEVICE_FREE(ALLOC%GAM_DEV_PTR)
+#else
+      !$ACC EXIT DATA DELETE(ALLOC%GAM_PTR)
+      DEALLOCATE(ALLOC%GAM_PTR)
+#endif
+      NULLIFY(ALLOC%GAM_PTR)
+    ENDIF
+
+    IF (.NOT. ASSOCIATED(ALLOC%GAM_PTR)) THEN
+#ifdef __HIP_PLATFORM_AMD__
+      ! This should be moved to an ACC_MALLOC or something similar but it doesn't seem to work.
+      ! The hipMalloc'ed address is registered with OpenACC as both the host and the device
+      ! address, and GAM_PTR is then pointed at that same memory.
+      CALL DEVICE_ALLOCATE(ALLOC%GAM_DEV_PTR,SZ)
+      !ALLOC%GAM_DEV_PTR = ACC_MALLOC(SZ)
+      CALL ACC_MAP_DATA(ALLOC%GAM_DEV_PTR, C_LOC(ALLOC%GAM_DEV_PTR),SZ)
+      CALL C_F_POINTER(C_LOC(ALLOC%GAM_DEV_PTR), ALLOC%GAM_PTR, [SZ])
+#else
+      ALLOCATE(ALLOC%GAM_PTR(SZ))
+      !$ACC ENTER DATA CREATE(ALLOC%GAM_PTR)
+#endif
+      ! NOTE(review): FREE_FUNCS / FREE_FUNCS_SZ are the same components the regular
+      ! allocation (PTR) uses, so this reset also drops callbacks registered for PTR - confirm.
+      ALLOC%FREE_FUNCS_SZ = 0
+    ENDIF
+  END SUBROUTINE
+
   SUBROUTINE REGISTER_FREE_FUNCTION(ALLOC, FREE_FUNC)
     USE ABORT_TRANS_MOD, ONLY: ABORT_TRANS
     IMPLICIT NONE
diff --git a/src/trans/gpu/algor/hip_allocator_mod.F90 b/src/trans/gpu/algor/hip_allocator_mod.F90
new file mode 100644
index 00000000..b282f150
--- /dev/null
+++ b/src/trans/gpu/algor/hip_allocator_mod.F90
@@ -0,0 +1,65 @@
+MODULE HIP_ALLOCATOR_MOD
+  ! Minimal Fortran bindings to the HIP runtime for allocating and freeing raw device memory.
+  USE ISO_C_BINDING
+
+  IMPLICIT NONE
+  SAVE
+  PRIVATE
+
+  PUBLIC :: DEVICE_ALLOCATE, DEVICE_FREE
+
+  INTERFACE
+    ! C prototype: hipError_t hipMalloc(void** ptr, size_t size)
+    FUNCTION HIPMALLOC(CPTR, PSIZE) BIND(C, NAME="hipMalloc") RESULT(IERR)
+      USE ISO_C_BINDING, ONLY : C_PTR, C_SIZE_T, C_INT
+      IMPLICIT NONE
+      TYPE(C_PTR), INTENT(OUT) :: CPTR   ! passed by reference (void**): receives the device address
+      INTEGER(C_SIZE_T), VALUE :: PSIZE
+      INTEGER(C_INT) :: IERR             ! hipError_t, 0 on success
+    END FUNCTION HIPMALLOC
+
+    ! C prototype: hipError_t hipFree(void* ptr)
+    FUNCTION HIPFREE(PTR) BIND(C, NAME="hipFree") RESULT(IERR)
+      USE ISO_C_BINDING, ONLY : C_PTR, C_INT
+      IMPLICIT NONE
+      TYPE(C_PTR), VALUE :: PTR          ! the device address itself, hence VALUE
+      INTEGER(C_INT) :: IERR             ! hipError_t, 0 on success
+    END FUNCTION HIPFREE
+  END INTERFACE
+
+CONTAINS
+
+  SUBROUTINE DEVICE_ALLOCATE(X, PSIZE)
+    ! Allocate PSIZE bytes of device memory with hipMalloc and associate X with them.
+    ! Any previous association of X is overwritten, not freed. Aborts if hipMalloc reports an error.
+    USE ISO_C_BINDING, ONLY : C_PTR, C_SIZE_T, C_INT8_T, C_INT
+    USE ABORT_TRANS_MOD, ONLY : ABORT_TRANS
+    IMPLICIT NONE
+    INTEGER(C_INT8_T), DIMENSION(:), POINTER, INTENT(INOUT) :: X
+    INTEGER(C_SIZE_T), VALUE :: PSIZE
+    TYPE(C_PTR) :: PTR
+    INTEGER(C_INT) :: IERR
+
+    ! X may be undefined on entry, so do not take C_LOC of it; hipMalloc overwrites PTR anyway.
+    PTR = C_NULL_PTR
+    IERR = HIPMALLOC(PTR, PSIZE)
+    IF (IERR /= 0_C_INT) CALL ABORT_TRANS("DEVICE_ALLOCATE: hipMalloc failed")
+    CALL C_F_POINTER(PTR, X, [PSIZE])
+
+  END SUBROUTINE DEVICE_ALLOCATE
+
+  SUBROUTINE DEVICE_FREE(X)
+    ! Release the device memory X points to with hipFree and disassociate X.
+    ! Does nothing when X is not associated. Aborts if hipFree reports an error.
+    USE ISO_C_BINDING, ONLY : C_INT8_T, C_INT
+    USE ABORT_TRANS_MOD, ONLY : ABORT_TRANS
+    IMPLICIT NONE
+    INTEGER(C_INT8_T), DIMENSION(:), POINTER, INTENT(INOUT) :: X
+    INTEGER(C_INT) :: IERR
+
+    IF (.NOT. ASSOCIATED(X)) RETURN
+    IERR = HIPFREE(C_LOC(X))
+    NULLIFY(X)
+    IF (IERR /= 0_C_INT) CALL ABORT_TRANS("DEVICE_FREE: hipFree failed")
+  END SUBROUTINE DEVICE_FREE
+
+END MODULE HIP_ALLOCATOR_MOD
diff --git a/src/trans/gpu/internal/trgtol_mod.F90 b/src/trans/gpu/internal/trgtol_mod.F90
index b2bebb58..d331e92a 100755
--- a/src/trans/gpu/internal/trgtol_mod.F90
+++ 
b/src/trans/gpu/internal/trgtol_mod.F90 @@ -12,19 +12,24 @@ MODULE TRGTOL_MOD USE BUFFERED_ALLOCATOR_MOD, ONLY: ALLOCATION_RESERVATION_HANDLE + USE ISO_C_BINDING, ONLY: C_SIZE_T IMPLICIT NONE PRIVATE PUBLIC :: TRGTOL_HANDLE, TRGTOL, PREPARE_TRGTOL TYPE TRGTOL_HANDLE - TYPE(ALLOCATION_RESERVATION_HANDLE) :: HCOMBUFS, HCOMBUFR_AND_REEL + TYPE(ALLOCATION_RESERVATION_HANDLE) :: HCOMBUFS_COMBUFR + TYPE(ALLOCATION_RESERVATION_HANDLE) :: HREEL + INTEGER(KIND=C_SIZE_T) :: COMBUFS_START + INTEGER(KIND=C_SIZE_T) :: COMBUFR_START + INTEGER(KIND=C_SIZE_T) :: REEL_START END TYPE CONTAINS FUNCTION PREPARE_TRGTOL(ALLOCATOR,KF_GP,KF_FS) RESULT(HTRGTOL) USE PARKIND_ECTRANS, ONLY: JPIM, JPRB, JPRBT, JPIB USE TPM_DISTR, ONLY: D - USE BUFFERED_ALLOCATOR_MOD, ONLY: BUFFERED_ALLOCATOR, RESERVE + USE BUFFERED_ALLOCATOR_MOD, ONLY: BUFFERED_ALLOCATOR, RESERVE, RESERVE_GAM USE ISO_C_BINDING, ONLY: C_SIZE_T, C_SIZEOF IMPLICIT NONE @@ -37,12 +42,16 @@ FUNCTION PREPARE_TRGTOL(ALLOCATOR,KF_GP,KF_FS) RESULT(HTRGTOL) INTEGER(KIND=JPIB) :: NELEM - HTRGTOL%HCOMBUFS = RESERVE(ALLOCATOR, 1_JPIB*KF_GP*D%NGPTOT*C_SIZEOF(DUMMY), "HTRGTOL%HCOMBUFS") + HTRGTOL%COMBUFS_START = 1 + NELEM = ALIGN(1_JPIB*KF_GP*D%NGPTOT*C_SIZEOF(DUMMY), 128) - NELEM = 0 - NELEM = NELEM + 1_JPIB*KF_FS*D%NLENGTF*C_SIZEOF(DUMMY) ! ZCOMBUFR - NELEM = NELEM + 1_JPIB*KF_FS*D%NLENGTF*C_SIZEOF(DUMMY) ! PREEL_REAL - HTRGTOL%HCOMBUFR_AND_REEL = RESERVE(ALLOCATOR, NELEM, "HTRGTOL%HCOMBUFR_AND_REEL") + HTRGTOL%COMBUFR_START = NELEM + 1 + NELEM = NELEM + ALIGN(1_JPIB*KF_FS*D%NLENGTF*C_SIZEOF(DUMMY), 128) + HTRGTOL%HCOMBUFS_COMBUFR = RESERVE_GAM(ALLOCATOR, NELEM, "HTRGTOL%HCOMBUFR_COMBUFS") + + HTRGTOL%REEL_START = 1 + NELEM = ALIGN(1_JPIB*KF_FS*D%NLENGTF*C_SIZEOF(DUMMY), 128) ! 
PREEL_REAL + HTRGTOL%HREEL = RESERVE(ALLOCATOR, NELEM, "HTRGTOL%HREEL") END FUNCTION PREPARE_TRGTOL SUBROUTINE TRGTOL(ALLOCATOR,HTRGTOL,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G,& @@ -121,7 +130,7 @@ SUBROUTINE TRGTOL(ALLOCATOR,HTRGTOL,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G, USE TPM_STATS, ONLY: GSTATS => GSTATS_NVTX USE TPM_TRANS, ONLY: NPROMA USE ISO_C_BINDING, ONLY: C_SIZE_T, C_FLOAT, C_DOUBLE, C_INT8_T, C_SIZEOF - USE BUFFERED_ALLOCATOR_MOD, ONLY: BUFFERED_ALLOCATOR, ASSIGN_PTR, GET_ALLOCATION + USE BUFFERED_ALLOCATOR_MOD, ONLY: BUFFERED_ALLOCATOR, ASSIGN_PTR, GET_ALLOCATION, GET_ALLOCATION_GAM USE OPENACC_EXT, ONLY: EXT_ACC_ARR_DESC, EXT_ACC_PASS, EXT_ACC_CREATE, & & EXT_ACC_DELETE USE OPENACC, ONLY: ACC_HANDLE_KIND @@ -333,9 +342,8 @@ SUBROUTINE TRGTOL(ALLOCATOR,HTRGTOL,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G, ENDDO block - CALL ASSIGN_PTR(PREEL_REAL, GET_ALLOCATION(ALLOCATOR, HTRGTOL%HCOMBUFR_AND_REEL),& - & 1_JPIB*KF_FS*D%NLENGTF*C_SIZEOF(PREEL_REAL(1))+1, 1_JPIB*KF_FS*D%NLENGTF*C_SIZEOF(PREEL_REAL(1))) - !!CALL ASSIGN_PTR(PREEL_REAL, GET_ALLOCATION(ALLOCATOR, HTRGTOL%HCOMBUFR_AND_REEL), size1, size2) + CALL ASSIGN_PTR(PREEL_REAL, GET_ALLOCATION(ALLOCATOR, HTRGTOL%HREEL),& + & HTRGTOL%REEL_START, 1_JPIB*KF_FS*D%NLENGTF*C_SIZEOF(PREEL_REAL(1))) end block #ifdef OMPGPU @@ -464,8 +472,8 @@ SUBROUTINE TRGTOL(ALLOCATOR,HTRGTOL,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G, ENDDO IF (ISEND_COUNTS > 0) THEN - CALL ASSIGN_PTR(ZCOMBUFS, GET_ALLOCATION(ALLOCATOR, HTRGTOL%HCOMBUFS),& - & 1_JPIB, ICOMBUFS_OFFSET(ISEND_COUNTS+1)*C_SIZEOF(ZCOMBUFS(1))) + CALL ASSIGN_PTR(ZCOMBUFS, GET_ALLOCATION_GAM(ALLOCATOR, HTRGTOL%HCOMBUFS_COMBUFR),& + & HTRGTOL%COMBUFS_START, ICOMBUFS_OFFSET(ISEND_COUNTS+1)*C_SIZEOF(ZCOMBUFS(1))) ENDIF !....Pack loop......................................................... 
@@ -573,8 +581,8 @@ SUBROUTINE TRGTOL(ALLOCATOR,HTRGTOL,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G, CALL GSTATS(411,0) IF (IRECV_COUNTS > 0) THEN - CALL ASSIGN_PTR(ZCOMBUFR, GET_ALLOCATION(ALLOCATOR, HTRGTOL%HCOMBUFR_AND_REEL),& - & 1_JPIB, ICOMBUFR_OFFSET(IRECV_COUNTS+1)*C_SIZEOF(ZCOMBUFR(1))) + CALL ASSIGN_PTR(ZCOMBUFR, GET_ALLOCATION_GAM(ALLOCATOR, HTRGTOL%HCOMBUFS_COMBUFR),& + & HTRGTOL%COMBUFR_START, ICOMBUFR_OFFSET(IRECV_COUNTS+1)*C_SIZEOF(ZCOMBUFR(1))) ENDIF #ifdef OMPGPU #endif diff --git a/src/trans/gpu/internal/trltog_mod.F90 b/src/trans/gpu/internal/trltog_mod.F90 index 905967a6..ddfb73b4 100755 --- a/src/trans/gpu/internal/trltog_mod.F90 +++ b/src/trans/gpu/internal/trltog_mod.F90 @@ -12,19 +12,22 @@ MODULE TRLTOG_MOD USE BUFFERED_ALLOCATOR_MOD, ONLY: ALLOCATION_RESERVATION_HANDLE + USE ISO_C_BINDING, ONLY: C_SIZE_T IMPLICIT NONE PRIVATE PUBLIC :: TRLTOG, TRLTOG_HANDLE, PREPARE_TRLTOG TYPE TRLTOG_HANDLE - TYPE(ALLOCATION_RESERVATION_HANDLE) :: HCOMBUFR_AND_COMBUFS + TYPE(ALLOCATION_RESERVATION_HANDLE) :: HCOMBUFR_COMBUFS + INTEGER(KIND=C_SIZE_T) :: COMBUFS_START + INTEGER(KIND=C_SIZE_T) :: COMBUFR_START END TYPE CONTAINS FUNCTION PREPARE_TRLTOG(ALLOCATOR,KF_FS,KF_GP) RESULT(HTRLTOG) USE PARKIND_ECTRANS, ONLY: JPIM, JPRBT, JPIB USE TPM_DISTR, ONLY: D - USE BUFFERED_ALLOCATOR_MOD, ONLY: BUFFERED_ALLOCATOR, RESERVE + USE BUFFERED_ALLOCATOR_MOD, ONLY: BUFFERED_ALLOCATOR, RESERVE_GAM USE ISO_C_BINDING, ONLY: C_SIZE_T, C_SIZEOF IMPLICIT NONE @@ -37,11 +40,13 @@ FUNCTION PREPARE_TRLTOG(ALLOCATOR,KF_FS,KF_GP) RESULT(HTRLTOG) INTEGER(KIND=JPIB) :: NELEM - NELEM = 0 - NELEM = NELEM + ALIGN(1_JPIB*KF_GP*D%NGPTOT*C_SIZEOF(DUMMY),128) ! ZCOMBUFR - NELEM = NELEM + ALIGN(1_JPIB*KF_FS*D%NLENGTF*C_SIZEOF(DUMMY),128) !ZCOMBUFS upper obund + HTRLTOG%COMBUFR_START = 1 + NELEM = ALIGN(1_JPIB*KF_GP*D%NGPTOT*C_SIZEOF(DUMMY), 128) ! 
ZCOMBUFR - HTRLTOG%HCOMBUFR_AND_COMBUFS = RESERVE(ALLOCATOR, NELEM, "HTRLTOG%HCOMBUFR_AND_COMBUFS") + HTRLTOG%COMBUFS_START = 1 + NELEM + NELEM = NELEM + ALIGN(1_JPIB*KF_FS*D%NLENGTF*C_SIZEOF(DUMMY), 128) !ZCOMBUFS upper bound + + HTRLTOG%HCOMBUFR_COMBUFS = RESERVE_GAM(ALLOCATOR, NELEM, "HTRLTOG%HCOMBUFR_AND_COMBUFS") END FUNCTION PREPARE_TRLTOG SUBROUTINE TRLTOG(ALLOCATOR,HTRLTOG,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G,KPTRGP,& @@ -121,7 +126,7 @@ SUBROUTINE TRLTOG(ALLOCATOR,HTRLTOG,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G, #endif USE TPM_STATS, ONLY: GSTATS => GSTATS_NVTX USE TPM_TRANS, ONLY: LDIVGP, LSCDERS, LUVDER, LVORGP, NPROMA - USE BUFFERED_ALLOCATOR_MOD, ONLY: BUFFERED_ALLOCATOR, ASSIGN_PTR, GET_ALLOCATION + USE BUFFERED_ALLOCATOR_MOD, ONLY: BUFFERED_ALLOCATOR, ASSIGN_PTR, GET_ALLOCATION_GAM USE ISO_C_BINDING, ONLY: C_SIZE_T, C_SIZEOF USE OPENACC_EXT, ONLY: EXT_ACC_ARR_DESC, EXT_ACC_PASS, EXT_ACC_CREATE, & & EXT_ACC_DELETE @@ -653,13 +658,12 @@ SUBROUTINE TRLTOG(ALLOCATOR,HTRLTOG,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G, ENDDO IF (IRECV_COUNTS > 0) THEN - CALL ASSIGN_PTR(ZCOMBUFR, GET_ALLOCATION(ALLOCATOR, HTRLTOG%HCOMBUFR_AND_COMBUFS),& - & 1_JPIB, ICOMBUFR_OFFSET(IRECV_COUNTS+1)*C_SIZEOF(ZCOMBUFR(1))) + CALL ASSIGN_PTR(ZCOMBUFR, GET_ALLOCATION_GAM(ALLOCATOR, HTRLTOG%HCOMBUFR_COMBUFS),& + & HTRLTOG%COMBUFR_START, ICOMBUFR_OFFSET(IRECV_COUNTS+1)*C_SIZEOF(ZCOMBUFR(1))) ENDIF IF (ISEND_COUNTS > 0) THEN - CALL ASSIGN_PTR(ZCOMBUFS, GET_ALLOCATION(ALLOCATOR, HTRLTOG%HCOMBUFR_AND_COMBUFS),& - & ALIGN(1_JPIB*KF_GP*D%NGPTOT*C_SIZEOF(ZCOMBUFR(1)),128)+1, & - & ICOMBUFS_OFFSET(ISEND_COUNTS+1)*C_SIZEOF(ZCOMBUFS(1))) + CALL ASSIGN_PTR(ZCOMBUFS, GET_ALLOCATION_GAM(ALLOCATOR, HTRLTOG%HCOMBUFR_COMBUFS),& + & HTRLTOG%COMBUFS_START, ICOMBUFS_OFFSET(ISEND_COUNTS+1)*C_SIZEOF(ZCOMBUFS(1))) ENDIF #ifdef OMPGPU diff --git a/src/trans/gpu/internal/trltom_mod.F90 b/src/trans/gpu/internal/trltom_mod.F90 index 173a0493..2d66c433 100755 --- 
a/src/trans/gpu/internal/trltom_mod.F90 +++ b/src/trans/gpu/internal/trltom_mod.F90 @@ -23,7 +23,7 @@ MODULE TRLTOM_MOD FUNCTION PREPARE_TRLTOM(ALLOCATOR, KF_FS) RESULT(HTRLTOM) USE PARKIND_ECTRANS, ONLY: JPIM, JPRBT, JPIB USE TPM_DISTR, ONLY: D - USE BUFFERED_ALLOCATOR_MOD, ONLY: BUFFERED_ALLOCATOR, RESERVE + USE BUFFERED_ALLOCATOR_MOD, ONLY: BUFFERED_ALLOCATOR, RESERVE_GAM USE ISO_C_BINDING, ONLY: C_SIZE_T, C_SIZEOF IMPLICIT NONE @@ -34,7 +34,7 @@ FUNCTION PREPARE_TRLTOM(ALLOCATOR, KF_FS) RESULT(HTRLTOM) REAL(KIND=JPRBT) :: DUMMY - HTRLTOM%HPFBUF = RESERVE(ALLOCATOR, 2_JPIB*D%NLENGT1B*KF_FS*C_SIZEOF(DUMMY), "HTRLTOM%HPFBUF") + HTRLTOM%HPFBUF = RESERVE_GAM(ALLOCATOR, 2_JPIB*D%NLENGT1B*KF_FS*C_SIZEOF(DUMMY), "HTRLTOM%HPFBUF") END FUNCTION SUBROUTINE TRLTOM(ALLOCATOR,HTRLTOM,PFBUF_IN,PFBUF,KF_FS) @@ -98,7 +98,7 @@ SUBROUTINE TRLTOM(ALLOCATOR,HTRLTOM,PFBUF_IN,PFBUF,KF_FS) ! Missing: MPI_ALLTOALLV on purpose due to cray-mpi bug (see https://github.com/ecmwf-ifs/ectrans/pull/157) #endif USE TPM_STATS, ONLY: GSTATS => GSTATS_NVTX - USE BUFFERED_ALLOCATOR_MOD, ONLY: BUFFERED_ALLOCATOR, ASSIGN_PTR, GET_ALLOCATION + USE BUFFERED_ALLOCATOR_MOD, ONLY: BUFFERED_ALLOCATOR, ASSIGN_PTR, GET_ALLOCATION_GAM USE ISO_C_BINDING, ONLY: C_SIZE_T, C_SIZEOF USE ABORT_TRANS_MOD, ONLY: ABORT_TRANS @@ -134,7 +134,7 @@ SUBROUTINE TRLTOM(ALLOCATOR,HTRLTOM,PFBUF_IN,PFBUF,KF_FS) IF (LHOOK) CALL DR_HOOK('TRLTOM',0,ZHOOK_HANDLE) - CALL ASSIGN_PTR(PFBUF, GET_ALLOCATION(ALLOCATOR, HTRLTOM%HPFBUF),& + CALL ASSIGN_PTR(PFBUF, GET_ALLOCATION_GAM(ALLOCATOR, HTRLTOM%HPFBUF),& & 1_JPIB, 2_JPIB*D%NLENGT1B*KF_FS*C_SIZEOF(PFBUF(1))) #ifdef OMPGPU diff --git a/src/trans/gpu/internal/trltom_pack_unpack.F90 b/src/trans/gpu/internal/trltom_pack_unpack.F90 index 3539028f..3fb72264 100755 --- a/src/trans/gpu/internal/trltom_pack_unpack.F90 +++ b/src/trans/gpu/internal/trltom_pack_unpack.F90 @@ -33,7 +33,7 @@ FUNCTION PREPARE_TRLTOM_PACK(ALLOCATOR, KF_FS) RESULT(HTRLTOM_PACK) USE PARKIND_ECTRANS, ONLY: JPIM, JPRBT, 
JPIB USE TPM_DISTR, ONLY: D USE ISO_C_BINDING, ONLY: C_SIZE_T, C_SIZEOF - USE BUFFERED_ALLOCATOR_MOD, ONLY: BUFFERED_ALLOCATOR, RESERVE + USE BUFFERED_ALLOCATOR_MOD, ONLY: BUFFERED_ALLOCATOR, RESERVE_GAM IMPLICIT NONE @@ -43,7 +43,7 @@ FUNCTION PREPARE_TRLTOM_PACK(ALLOCATOR, KF_FS) RESULT(HTRLTOM_PACK) REAL(KIND=JPRBT) :: DUMMY - HTRLTOM_PACK%HFOUBUF_IN = RESERVE(ALLOCATOR, 2_JPIB*D%NLENGT0B*KF_FS*C_SIZEOF(DUMMY), "HTRLTOM_PACK%HFOUBUF_IN") + HTRLTOM_PACK%HFOUBUF_IN = RESERVE_GAM(ALLOCATOR, 2_JPIB*D%NLENGT0B*KF_FS*C_SIZEOF(DUMMY), "HTRLTOM_PACK%HFOUBUF_IN") END FUNCTION PREPARE_TRLTOM_PACK SUBROUTINE TRLTOM_PACK(ALLOCATOR,HTRLTOM_PACK,PREEL_COMPLEX,FOUBUF_IN,KF_FS) @@ -69,7 +69,7 @@ SUBROUTINE TRLTOM_PACK(ALLOCATOR,HTRLTOM_PACK,PREEL_COMPLEX,FOUBUF_IN,KF_FS) ! ------------------------------------------------------------------ - USE BUFFERED_ALLOCATOR_MOD, ONLY: BUFFERED_ALLOCATOR, ASSIGN_PTR, GET_ALLOCATION + USE BUFFERED_ALLOCATOR_MOD, ONLY: BUFFERED_ALLOCATOR, ASSIGN_PTR, GET_ALLOCATION_GAM USE PARKIND_ECTRANS, ONLY: JPIM, JPRBT, JPIB USE TPM_DISTR, ONLY: D, MYSETW USE TPM_GEOMETRY, ONLY: G @@ -93,7 +93,7 @@ SUBROUTINE TRLTOM_PACK(ALLOCATOR,HTRLTOM_PACK,PREEL_COMPLEX,FOUBUF_IN,KF_FS) ASSOCIATE(D_NSTAGTF=>D%NSTAGTF, D_NPNTGTB0=>D%NPNTGTB0, D_NPTRLS=>D%NPTRLS, & & D_NDGL_FS=>D%NDGL_FS, G_NMEN=>G%NMEN, G_NLOEN=>G%NLOEN, R_NSMAX=>R%NSMAX) - CALL ASSIGN_PTR(FOUBUF_IN, GET_ALLOCATION(ALLOCATOR, HTRLTOM_PACK%HFOUBUF_IN),& + CALL ASSIGN_PTR(FOUBUF_IN, GET_ALLOCATION_GAM(ALLOCATOR, HTRLTOM_PACK%HFOUBUF_IN),& & 1_JPIB, 2_JPIB*D%NLENGT0B*KF_FS*C_SIZEOF(FOUBUF_IN(1))) #ifdef OMPGPU diff --git a/src/trans/gpu/internal/trmtol_mod.F90 b/src/trans/gpu/internal/trmtol_mod.F90 index 73d93f8c..11c027a7 100755 --- a/src/trans/gpu/internal/trmtol_mod.F90 +++ b/src/trans/gpu/internal/trmtol_mod.F90 @@ -23,7 +23,7 @@ MODULE TRMTOL_MOD FUNCTION PREPARE_TRMTOL(ALLOCATOR, KF_LEG) RESULT(HTRMTOL) USE PARKIND_ECTRANS, ONLY: JPIM, JPRBT, JPIB USE TPM_DISTR, ONLY: D - USE 
BUFFERED_ALLOCATOR_MOD, ONLY: BUFFERED_ALLOCATOR, RESERVE + USE BUFFERED_ALLOCATOR_MOD, ONLY: BUFFERED_ALLOCATOR, RESERVE_GAM USE ISO_C_BINDING, ONLY: C_SIZE_T, C_SIZEOF IMPLICIT NONE @@ -34,7 +34,7 @@ FUNCTION PREPARE_TRMTOL(ALLOCATOR, KF_LEG) RESULT(HTRMTOL) REAL(KIND=JPRBT) :: DUMMY - HTRMTOL%HPFBUF = RESERVE(ALLOCATOR, 2_JPIB*D%NLENGT0B*KF_LEG*C_SIZEOF(DUMMY), "HTRMTOL%HPFBUF") + HTRMTOL%HPFBUF = RESERVE_GAM(ALLOCATOR, 2_JPIB*D%NLENGT0B*KF_LEG*C_SIZEOF(DUMMY), "HTRMTOL%HPFBUF") END FUNCTION SUBROUTINE TRMTOL(ALLOCATOR,HTRMTOL,PFBUF_IN,PFBUF,KF_LEG) @@ -97,7 +97,7 @@ SUBROUTINE TRMTOL(ALLOCATOR,HTRMTOL,PFBUF_IN,PFBUF,KF_LEG) USE MPI_F08, ONLY: MPI_COMM, MPI_REAL4, MPI_REAL8 ! Missing: MPI_ALLTOALLV on purpose due to cray-mpi bug (see https://github.com/ecmwf-ifs/ectrans/pull/157) #endif - USE BUFFERED_ALLOCATOR_MOD, ONLY: BUFFERED_ALLOCATOR, ASSIGN_PTR, GET_ALLOCATION + USE BUFFERED_ALLOCATOR_MOD, ONLY: BUFFERED_ALLOCATOR, ASSIGN_PTR, GET_ALLOCATION_GAM USE TPM_STATS, ONLY: GSTATS => GSTATS_NVTX USE ISO_C_BINDING, ONLY: C_SIZE_T, C_SIZEOF USE ABORT_TRANS_MOD, ONLY: ABORT_TRANS @@ -135,7 +135,7 @@ SUBROUTINE TRMTOL(ALLOCATOR,HTRMTOL,PFBUF_IN,PFBUF,KF_LEG) IF (LHOOK) CALL DR_HOOK('TRMTOL',0,ZHOOK_HANDLE) - CALL ASSIGN_PTR(PFBUF, GET_ALLOCATION(ALLOCATOR, HTRMTOL%HPFBUF),& + CALL ASSIGN_PTR(PFBUF, GET_ALLOCATION_GAM(ALLOCATOR, HTRMTOL%HPFBUF),& & 1_JPIB, 2_JPIB*D%NLENGT0B*KF_LEG*C_SIZEOF(PFBUF(1))) IF(NPROC > 1) THEN diff --git a/src/trans/gpu/internal/trmtol_pack_unpack.F90 b/src/trans/gpu/internal/trmtol_pack_unpack.F90 index 76080145..ba038401 100755 --- a/src/trans/gpu/internal/trmtol_pack_unpack.F90 +++ b/src/trans/gpu/internal/trmtol_pack_unpack.F90 @@ -29,7 +29,7 @@ FUNCTION PREPARE_TRMTOL_PACK(ALLOCATOR,KF_LEG) RESULT(HTRMTOL_PACK) USE PARKIND_ECTRANS, ONLY: JPIM, JPRBT, JPIB USE TPM_DISTR, ONLY: D USE ISO_C_BINDING, ONLY: C_SIZE_T, C_SIZEOF - USE BUFFERED_ALLOCATOR_MOD, ONLY: BUFFERED_ALLOCATOR, RESERVE + USE BUFFERED_ALLOCATOR_MOD, ONLY: 
BUFFERED_ALLOCATOR, RESERVE_GAM IMPLICIT NONE @@ -43,7 +43,7 @@ FUNCTION PREPARE_TRMTOL_PACK(ALLOCATOR,KF_LEG) RESULT(HTRMTOL_PACK) REAL(KIND=JPRBT) :: ZPRBT_DUMMY IALLOC_SZ = 2_JPIB*D%NLENGT1B*KF_LEG*C_SIZEOF(ZPRBT_DUMMY) - HTRMTOL_PACK%HFOUBUF_IN = RESERVE(ALLOCATOR, IALLOC_SZ, "HTRMTOL_PACK%HFOUBUF_IN") + HTRMTOL_PACK%HFOUBUF_IN = RESERVE_GAM(ALLOCATOR, IALLOC_SZ, "HTRMTOL_PACK%HFOUBUF_IN") END FUNCTION SUBROUTINE TRMTOL_PACK(ALLOCATOR,HTRMTOL_PACK,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,FOUBUF_IN,KF_LEG) @@ -90,7 +90,7 @@ SUBROUTINE TRMTOL_PACK(ALLOCATOR,HTRMTOL_PACK,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,FOUBUF_I USE TPM_GEOMETRY, ONLY: G USE TPM_DISTR, ONLY: D USE LEINV_MOD, ONLY: LEINV_STRIDES - USE BUFFERED_ALLOCATOR_MOD, ONLY: BUFFERED_ALLOCATOR, ASSIGN_PTR, GET_ALLOCATION + USE BUFFERED_ALLOCATOR_MOD, ONLY: BUFFERED_ALLOCATOR, ASSIGN_PTR, GET_ALLOCATION_GAM USE ISO_C_BINDING, ONLY: C_SIZE_T, C_SIZEOF IMPLICIT NONE @@ -120,7 +120,7 @@ SUBROUTINE TRMTOL_PACK(ALLOCATOR,HTRMTOL_PACK,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,FOUBUF_I IF (LHOOK) CALL DR_HOOK('TRMTOL_PACK',0,ZHOOK_HANDLE) - CALL ASSIGN_PTR(FOUBUF_IN, GET_ALLOCATION(ALLOCATOR, HTRMTOL_PACK%HFOUBUF_IN),& + CALL ASSIGN_PTR(FOUBUF_IN, GET_ALLOCATION_GAM(ALLOCATOR, HTRMTOL_PACK%HFOUBUF_IN),& & 1_JPIB, 2_JPIB*D%NLENGT1B*KF_LEG*C_SIZEOF(FOUBUF_IN(1))) CALL LEINV_STRIDES(KF_LEG,IOUT_STRIDES0=IOUT_STRIDES0,IOUT_SIZE=IOUT_SIZE,&