Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

MI300A Race Condition Fix #185

Open
wants to merge 3 commits into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
91 changes: 44 additions & 47 deletions src/trans/gpu/internal/trgtol_mod.F90
Original file line number Diff line number Diff line change
Expand Up @@ -168,9 +168,8 @@ SUBROUTINE TRGTOL(ALLOCATOR,HTRGTOL,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G,
INTEGER(KIND=JPIM) :: ISEND_FIELD_COUNT(NPRTRV),ISEND_FIELD_COUNT_V
INTEGER(KIND=JPIM) :: ISEND_WSET_SIZE(NPRTRW),ISEND_WSET_SIZE_V
INTEGER(KIND=JPIM) :: ISEND_WSET_OFFSET(NPRTRW+1), ISEND_WSET_OFFSET_V
INTEGER(KIND=JPIB), ALLOCATABLE :: ICOMBUFS_OFFSET(:),ICOMBUFR_OFFSET(:)
INTEGER(KIND=JPIB), ALLOCATABLE :: ICOMBUFS_OFFSET(:),ICOMBUFR_OFFSET(:), IFLDA(:,:)
INTEGER(KIND=JPIB) :: ICOMBUFS_OFFSET_V, ICOMBUFR_OFFSET_V
INTEGER(KIND=JPIM) :: IFLDA(KF_GP)
INTEGER(KIND=JPIM) :: IVSET(KF_GP)

REAL(KIND=JPHOOK) :: ZHOOK_HANDLE
Expand Down Expand Up @@ -476,46 +475,71 @@ SUBROUTINE TRGTOL(ALLOCATOR,HTRGTOL,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G,
#endif

CALL GSTATS(1602,0)
! Allocate this buffer. Add 1 for the potential self sends
ALLOCATE(IFLDA(KF_GP,1+ISEND_COUNTS))

IF(LLOCAL_CONTRIBUTION)THEN
! I have to send something to myself...

! Input is KF_GP fields. We find the resulting KF_FS fields.
IFLDS = 0
DO JFLD=1,KF_GP
IF(IVSET(JFLD) == MYSETV .OR. IVSET(JFLD) == -1) THEN
IFLDS = IFLDS+1
IF(PRESENT(KPTRGP)) THEN
IFLDA(IFLDS,1) = KPTRGP(JFLD)
ELSE
IFLDA(IFLDS,1) = JFLD
ENDIF
ENDIF
ENDDO
ENDIF
DO INS=1,ISEND_COUNTS
ISEND=ISEND_TO_PROC(INS)
CALL PE2SET(ISEND,ISETA,ISETB,ISETW,ISETV)

ISEND_FIELD_COUNT_V = ISEND_FIELD_COUNT(ISETV)
ICOMBUFS_OFFSET_V = ICOMBUFS_OFFSET(INS)


IFLDS = 0
DO JFLD=1,KF_GP
IF(IVSET(JFLD) == ISETV .OR. IVSET(JFLD) == -1 ) THEN
IFLDS = IFLDS+1
IF(PRESENT(KPTRGP)) THEN
IFLDA(IFLDS)=KPTRGP(JFLD)
IFLDA(IFLDS,1+INS)=KPTRGP(JFLD)
ELSE
IFLDA(IFLDS)=JFLD
IFLDA(IFLDS,1+INS)=JFLD
ENDIF
ENDIF
ENDDO
ENDDO

#ifdef OMPGPU
#endif
#ifdef ACCGPU
!$ACC DATA COPYIN(IFLDA(1:ISEND_FIELD_COUNT_V)) ASYNC(1)
!$ACC DATA COPYIN(IFLDA) ASYNC(1)
#endif

DO INS=1,ISEND_COUNTS
ISEND=ISEND_TO_PROC(INS)
CALL PE2SET(ISEND,ISETA,ISETB,ISETW,ISETV)

ISEND_FIELD_COUNT_V = ISEND_FIELD_COUNT(ISETV)
ICOMBUFS_OFFSET_V = ICOMBUFS_OFFSET(INS)

ISEND_WSET_OFFSET_V = ISEND_WSET_OFFSET(ISETW)
ISEND_WSET_SIZE_V = ISEND_WSET_SIZE(ISETW)
IF(PRESENT(PGP)) THEN
#ifdef OMPGPU
#endif
#ifdef ACCGPU
!$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) PRIVATE(JK,JBLK,IFLD,JI) &
!$ACC& FIRSTPRIVATE(ISEND_FIELD_COUNT_V,ISEND_WSET_SIZE_V,ISEND_WSET_OFFSET_V,&
!$ACC& FIRSTPRIVATE(INS,ISEND_FIELD_COUNT_V,ISEND_WSET_SIZE_V,ISEND_WSET_OFFSET_V,&
!$ACC& ICOMBUFS_OFFSET_V,NPROMA) ASYNC(1)
#endif
DO JFLD=1,ISEND_FIELD_COUNT_V
DO JL=1,ISEND_WSET_SIZE_V
JK = MOD(ISEND_WSET_OFFSET_V+JL-1,NPROMA)+1
JBLK = (ISEND_WSET_OFFSET_V+JL-1)/NPROMA+1
IFLD = IFLDA(JFLD)
IFLD = IFLDA(JFLD,1+INS)
JI = (JFLD-1)*ISEND_WSET_SIZE_V+JL
ZCOMBUFS(ICOMBUFS_OFFSET_V+JI) = PGP(JK,IFLD,JBLK)
ENDDO
Expand All @@ -525,14 +549,14 @@ SUBROUTINE TRGTOL(ALLOCATOR,HTRGTOL,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G,
#endif
#ifdef ACCGPU
!$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) PRIVATE(JK,JBLK,IFLD,JI,IOFF,PBOUND) &
!$ACC& FIRSTPRIVATE(ISEND_FIELD_COUNT_V,ISEND_WSET_SIZE_V,ISEND_WSET_OFFSET_V,&
!$ACC& FIRSTPRIVATE(INS,ISEND_FIELD_COUNT_V,ISEND_WSET_SIZE_V,ISEND_WSET_OFFSET_V,&
!$ACC& ICOMBUFS_OFFSET_V,NPROMA) ASYNC(1)
#endif
DO JFLD=1,ISEND_FIELD_COUNT_V
DO JL=1,ISEND_WSET_SIZE_V
JK = MOD(ISEND_WSET_OFFSET_V+JL-1,NPROMA)+1
JBLK = (ISEND_WSET_OFFSET_V+JL-1)/NPROMA+1
IFLD = IFLDA(JFLD)
IFLD = IFLDA(JFLD,1+INS)
JI = ICOMBUFS_OFFSET_V+(JFLD-1)*ISEND_WSET_SIZE_V+JL
IF(IFLD < PGP_INDICES(PGP_INDICES_UV+1)) THEN
IOFF=IFLD-PGP_INDICES(PGP_INDICES_UV)
Expand All @@ -554,15 +578,13 @@ SUBROUTINE TRGTOL(ALLOCATOR,HTRGTOL,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G,
ENDDO
ENDDO
ENDIF
ENDDO
#ifdef OMPGPU
#endif
#ifdef ACCGPU
!$ACC END DATA
#endif
ENDDO
#ifdef ACCGPU
!$ACC WAIT(1)
#endif

CALL GSTATS(1602,1)

IF (LSYNC_TRANS) THEN
Expand Down Expand Up @@ -634,27 +656,6 @@ SUBROUTINE TRGTOL(ALLOCATOR,HTRGTOL,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G,

! Copy local contribution
IF(LLOCAL_CONTRIBUTION)THEN
! I have to send something to myself...

! Input is KF_GP fields. We find the resulting KF_FS fields.
IFLDS = 0
DO JFLD=1,KF_GP
IF(IVSET(JFLD) == MYSETV .OR. IVSET(JFLD) == -1) THEN
IFLDS = IFLDS+1
IF(PRESENT(KPTRGP)) THEN
IFLDA(IFLDS) = KPTRGP(JFLD)
ELSE
IFLDA(IFLDS) = JFLD
ENDIF
ENDIF
ENDDO

#ifdef OMPGPU
#endif
#ifdef ACCGPU
!$ACC DATA COPYIN(IFLDA(1:IFLDS)) ASYNC(1)
#endif

ISEND_WSET_OFFSET_V = ISEND_WSET_OFFSET(MYSETW)
ISEND_WSET_SIZE_V = ISEND_WSET_SIZE(MYSETW)
IRECV_BUFR_TO_OUT_V = IRECV_BUFR_TO_OUT_OFFSET(MYPROC)
Expand All @@ -671,7 +672,7 @@ SUBROUTINE TRGTOL(ALLOCATOR,HTRGTOL,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G,
DO JL=1,ISEND_WSET_SIZE_V
JK = MOD(ISEND_WSET_OFFSET_V+JL-1,NPROMA)+1
JBLK = (ISEND_WSET_OFFSET_V+JL-1)/NPROMA+1
IFLD = IFLDA(JFLD)
IFLD = IFLDA(JFLD,1)
IPOS = IRECV_BUFR_TO_OUT(IRECV_BUFR_TO_OUT_V+JL,1)+ &
& (JFLD-1)*IRECV_BUFR_TO_OUT(IRECV_BUFR_TO_OUT_V+JL,2)+1
PREEL_REAL(IPOS) = PGP(JK,IFLD,JBLK)
Expand All @@ -689,7 +690,7 @@ SUBROUTINE TRGTOL(ALLOCATOR,HTRGTOL,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G,
DO JL=1,ISEND_WSET_SIZE_V
JK = MOD(ISEND_WSET_OFFSET_V+JL-1,NPROMA)+1
JBLK = (ISEND_WSET_OFFSET_V+JL-1)/NPROMA+1
IFLD = IFLDA(JFLD)
IFLD = IFLDA(JFLD,1)
IPOS = IRECV_BUFR_TO_OUT(IRECV_BUFR_TO_OUT_V+JL,1)+ &
& (JFLD-1)*IRECV_BUFR_TO_OUT(IRECV_BUFR_TO_OUT_V+JL,2)+1
IF(IFLD < PGP_INDICES(PGP_INDICES_UV+1)) THEN
Expand All @@ -712,16 +713,8 @@ SUBROUTINE TRGTOL(ALLOCATOR,HTRGTOL,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G,
ENDDO
ENDIF
CALL GSTATS(1601,1)

#ifdef OMPGPU
#endif
#ifdef ACCGPU
!$ACC END DATA
#endif

ENDIF


IF(IR > 0) THEN
CALL MPL_WAIT(KREQUEST=IREQ(1:IR), &
& CDSTRING='TRGTOL: WAIT FOR SENDS AND RECEIVES')
Expand Down Expand Up @@ -774,6 +767,7 @@ SUBROUTINE TRGTOL(ALLOCATOR,HTRGTOL,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G,
#endif
#ifdef ACCGPU
!$ACC END DATA ! ZCOMBUFR
!$ACC END DATA ! IFLDA
!$ACC END DATA ! IRECV_BUFR_TO_OUT
!$ACC END DATA ! PGPINDICES
!$ACC END DATA !ZCOMBUFS (present)
Expand All @@ -785,6 +779,9 @@ SUBROUTINE TRGTOL(ALLOCATOR,HTRGTOL,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G,
#endif
IF (ACC_POINTERS_CNT > 0) CALL EXT_ACC_DELETE(ACC_POINTERS(1:ACC_POINTERS_CNT),STREAM=1_ACC_HANDLE_KIND)

! Free this now
DEALLOCATE(IFLDA)

IF (LHOOK) CALL DR_HOOK('TRGTOL',1,ZHOOK_HANDLE)
END SUBROUTINE TRGTOL
END MODULE TRGTOL_MOD
Loading