From b73d2c951c07171f955c551d719b95535854b546 Mon Sep 17 00:00:00 2001 From: Ahmad Nawab Date: Mon, 18 Nov 2024 11:21:26 +0000 Subject: [PATCH 1/9] PROPAG_WAM: port code paths for depth and current refraction to GPU --- src/ecwam/ctuwupdt.F90 | 10 +++++----- src/ecwam/gradi.F90 | 14 ++++++++++++++ src/ecwam/propag_wam.F90 | 15 ++++++++++----- src/ecwam/propags2.F90 | 8 ++++---- src/ecwam/propdot.F90 | 12 ++++++++++++ src/ecwam/yowubuf.F90 | 7 +++++++ 6 files changed, 52 insertions(+), 14 deletions(-) diff --git a/src/ecwam/ctuwupdt.F90 b/src/ecwam/ctuwupdt.F90 index af657d692..da41a2df0 100644 --- a/src/ecwam/ctuwupdt.F90 +++ b/src/ecwam/ctuwupdt.F90 @@ -257,7 +257,7 @@ SUBROUTINE CTUWUPDT (IJS, IJL, NINF, NSUP, & IF (IREFRA == 2 .OR. IREFRA == 3) THEN - !$acc parallel loop independent collapse(4) + !$acc parallel loop independent collapse(4) present(WLATN,LLWLATN) DO ICL=1,2 DO IC=1,2 DO K=1,NANG @@ -276,7 +276,7 @@ SUBROUTINE CTUWUPDT (IJS, IJL, NINF, NSUP, & ENDDO !$acc end parallel - !$acc parallel loop independent collapse(3) + !$acc parallel loop independent collapse(3) present(WLONN,LLWLONN) DO IC=1,2 DO M=1,NFRE_RED DO K=1,NANG @@ -293,7 +293,7 @@ SUBROUTINE CTUWUPDT (IJS, IJL, NINF, NSUP, & ENDDO !$acc end parallel - !$acc parallel loop independent collapse(4) + !$acc parallel loop independent collapse(4) present(WCORN,LLWCORN) DO ICL=1,2 DO ICR=1,4 DO M=1,NFRE_RED @@ -312,7 +312,7 @@ SUBROUTINE CTUWUPDT (IJS, IJL, NINF, NSUP, & ENDDO !$acc end parallel - !$acc parallel loop independent collapse(3) + !$acc parallel loop independent collapse(3) present(WKPMN,LLWKPMN) DO IC=-1,1 DO M=1,NFRE_RED DO K=1,NANG @@ -329,7 +329,7 @@ SUBROUTINE CTUWUPDT (IJS, IJL, NINF, NSUP, & ENDDO !$acc end parallel - !$acc parallel loop independent collapse(3) + !$acc parallel loop independent collapse(3) present(WMPMN,LLWMPMN) DO IC=-1,1 DO M=1,NFRE_RED DO K=1,NANG diff --git a/src/ecwam/gradi.F90 b/src/ecwam/gradi.F90 index 36c196aac..fda353184 100644 --- a/src/ecwam/gradi.F90 +++ b/src/ecwam/gradi.F90 @@ -107,6 +107,8 @@ SUBROUTINE GRADI (KIJS, KIJL, NINF, NSUP, IREFRA, & !* 1. INITIALISE. ! ----------- +!$acc data present(KLAT,WLAT,DPTHEXT) copyin(DELLAM) + NLAND=NSUP+1 ONEO2DELPHI = 0.5_JWRB/DELPHI @@ -116,6 +118,7 @@ SUBROUTINE GRADI (KIJS, KIJL, NINF, NSUP, IREFRA, & ! -------------------------- IF (IREFRA == 1 .OR. IREFRA == 3) THEN + !$acc kernels DO IJ=KIJS,KIJL IPP = KLAT(IJ,2,1) IPM = KLAT(IJ,1,1) @@ -146,11 +149,14 @@ SUBROUTINE GRADI (KIJS, KIJL, NINF, NSUP, IREFRA, & DDLAM(IJ) = 0.0_JWRB ENDIF ENDDO + !$acc end kernels ELSE + !$acc kernels DO IJ=KIJS,KIJL DDPHI(IJ) = 0.0_JWRB DDLAM(IJ) = 0.0_JWRB ENDDO + !$acc end kernels ENDIF ! ---------------------------------------------------------------------- @@ -159,6 +165,7 @@ SUBROUTINE GRADI (KIJS, KIJL, NINF, NSUP, IREFRA, & ! ------------------------------------- IF (IREFRA == 2 .OR. IREFRA == 3) THEN + !$acc kernels DO IJ=KIJS,KIJL IPP = KLAT(IJ,2,1) ! exact 0 means that the current field was not defined, hence @@ -206,7 +213,9 @@ SUBROUTINE GRADI (KIJS, KIJL, NINF, NSUP, IREFRA, & DVLAM(IJ) = 0.0_JWRB ENDIF ENDDO + !$acc end kernels + !$acc kernels DO IJ=KIJS,KIJL KX = BLK2GLO%KXLT(IJ) CGMAX = CURRENT_GRADIENT_MAX*COSPH(KX) @@ -215,16 +224,21 @@ SUBROUTINE GRADI (KIJS, KIJL, NINF, NSUP, IREFRA, & DULAM(IJ) = SIGN(MIN(ABS(DULAM(IJ)),CGMAX),DULAM(IJ)) DVLAM(IJ) = SIGN(MIN(ABS(DVLAM(IJ)),CGMAX),DVLAM(IJ)) ENDDO + !$acc end kernels ELSE + !$acc kernels DO IJ=KIJS,KIJL DUPHI(IJ) = 0.0_JWRB DVPHI(IJ) = 0.0_JWRB DULAM(IJ) = 0.0_JWRB DVLAM(IJ) = 0.0_JWRB ENDDO + !$acc end kernels ENDIF +!$acc end data + IF (LHOOK) CALL DR_HOOK('GRADI',1,ZHOOK_HANDLE) END SUBROUTINE GRADI diff --git a/src/ecwam/propag_wam.F90 b/src/ecwam/propag_wam.F90 index 518fe5d74..7c2b82d32 100644 --- a/src/ecwam/propag_wam.F90 +++ b/src/ecwam/propag_wam.F90 @@ -117,7 +117,7 @@ SUBROUTINE PROPAG_WAM (BLK2GLO, WAVNUM, CGROUP, OMOSNH2KD, FL1, & !!! the advection schemes are still written in block structure !!! mapping chuncks to block ONLY for actual grid points !!!! #ifdef _OPENACC - !$acc kernels loop independent private(KIJS, IJSB, KIJL, IJLB) + !$acc parallel loop private(KIJS, IJSB, KIJL, IJLB) #else !$OMP PARALLEL DO SCHEDULE(STATIC) PRIVATE(ICHNK, KIJS, IJSB, KIJL, IJLB, M, K) #endif /*_OPENACC*/ @@ -134,7 +134,7 @@ SUBROUTINE PROPAG_WAM (BLK2GLO, WAVNUM, CGROUP, OMOSNH2KD, FL1, & ENDDO ENDDO #ifdef _OPENACC - !$acc end kernels + !$acc end parallel loop #else !$OMP END PARALLEL DO #endif /*_OPENACC*/ @@ -171,9 +171,6 @@ SUBROUTINE PROPAG_WAM (BLK2GLO, WAVNUM, CGROUP, OMOSNH2KD, FL1, & ! --------------------- IF (LLUPDTTD) THEN -#ifdef _OPENACC - CALL WAM_ABORT("PROPAG_WAM: BRANCH NOT YET PORTED FOR GPU EXECUTION") -#endif IF (.NOT.ALLOCATED(THDC)) ALLOCATE(THDC(IJSG:IJLG, NANG)) IF (.NOT.ALLOCATED(THDD)) ALLOCATE(THDD(IJSG:IJLG, NANG)) IF (.NOT.ALLOCATED(SDOT)) ALLOCATE(SDOT(IJSG:IJLG, NANG, NFRE_RED)) @@ -187,7 +184,11 @@ SUBROUTINE PROPAG_WAM (BLK2GLO, WAVNUM, CGROUP, OMOSNH2KD, FL1, & ! DOT THETA TERM: +#ifdef _OPENACC + !$acc data create(THDC,THDD,SDOT) present(BUFFER_EXT,BLK2GLO) +#else !$OMP PARALLEL DO SCHEDULE(DYNAMIC,1) PRIVATE(JKGLO, KIJS, KIJL) +#endif DO JKGLO = IJSG, IJLG, NPROMA KIJS=JKGLO KIJL=MIN(KIJS+NPROMA-1, IJLG) @@ -199,7 +200,11 @@ SUBROUTINE PROPAG_WAM (BLK2GLO, WAVNUM, CGROUP, OMOSNH2KD, FL1, & & BUFFER_EXT(:,3*NFRE_RED+4), BUFFER_EXT(:,3*NFRE_RED+5), & & THDC(KIJS:KIJL,:), THDD(KIJS:KIJL,:), SDOT(KIJS:KIJL,:,:)) ENDDO +#ifdef _OPENACC + !$acc end data +#else !$OMP END PARALLEL DO +#endif LLUPDTTD = .FALSE. ENDIF diff --git a/src/ecwam/propags2.F90 b/src/ecwam/propags2.F90 index c1f52c5b9..133569576 100644 --- a/src/ecwam/propags2.F90 +++ b/src/ecwam/propags2.F90 @@ -96,7 +96,7 @@ SUBROUTINE PROPAGS2 (F1, F3, NINF, NSUP, KIJS, KIJL, NANG, ND3SF1, ND3EF1, ND3S, !* WITHOUT DEPTH OR/AND CURRENT REFRACTION. ! ---------------------------------------- - !$acc kernels loop present(F1,F3,KLON,KLAT,KCOR,SUMWN,WLONN,WLATN,WCORN,JXO,JYO,KCR,WKPMN,LLWKPMN,KPM) + !$acc kernels present(F1,F3,KLON,KLAT,KCOR,SUMWN,WLONN,WLATN,WCORN,JXO,JYO,KCR,WKPMN,KPM) DO K = 1, NANG DO M = ND3S, ND3E @@ -122,10 +122,9 @@ SUBROUTINE PROPAGS2 (F1, F3, NINF, NSUP, KIJS, KIJL, NANG, ND3SF1, ND3EF1, ND3S, ELSE !* DEPTH AND CURRENT REFRACTION. ! ----------------------------- -#ifdef _OPENACC - CALL WAM_ABORT("PROPAGS2: BRANCH NOT YET PORTED FOR GPU EXECUTION") -#endif + !$acc kernels present(F1,F3,SUMWN,WLONN,KLON,WLATN,KLAT,WCORN,KCOR, & + !$acc & LLWKPMN,WKPMN,KPM,LLWMPMN,WMPMN,MPM) DO M = ND3S, ND3E DO K = 1, NANG @@ -177,6 +176,7 @@ SUBROUTINE PROPAGS2 (F1, F3, NINF, NSUP, KIJS, KIJL, NANG, ND3SF1, ND3EF1, ND3S, ENDDO ENDDO + !$acc end kernels ENDIF diff --git a/src/ecwam/propdot.F90 b/src/ecwam/propdot.F90 index 9d94c317b..587dcacad 100644 --- a/src/ecwam/propdot.F90 +++ b/src/ecwam/propdot.F90 @@ -101,6 +101,8 @@ SUBROUTINE PROPDOT(KIJS, KIJL, NINF, NSUP, & IF (LHOOK) CALL DR_HOOK('PROPDOT',0,ZHOOK_HANDLE) + !$acc data create(DDPHI,DDLAM,DUPHI,DULAM,DVPHI,DVLAM,DCO,OMDD) & + !$acc & present(COSPHM1_EXT,U_EXT,V_EXT,SINTH,COSTH) !* 2.2 DEPTH AND CURRENT GRADIENTS. ! ---------------------------- @@ -114,6 +116,7 @@ SUBROUTINE PROPDOT(KIJS, KIJL, NINF, NSUP, & !* 2.3 COSINE OF LATITUDES IF SPHERICAL PROPAGATION. ! --------------------------------------------- + !$acc kernels IF (ICASE == 1) THEN DO IJ = KIJS,KIJL DCO(IJ) = COSPHM1_EXT(IJ) @@ -136,11 +139,13 @@ SUBROUTINE PROPDOT(KIJS, KIJL, NINF, NSUP, & OMDD(IJ) = 0.0_JWRB ENDDO ENDIF + !$acc end kernels !* 2.5. LOOP OVER DIRECTIONS. ! --------------------- + !$acc parallel loop DO K=1,NANG SD = SINTH(K) CD = COSTH(K) @@ -149,10 +154,12 @@ SUBROUTINE PROPDOT(KIJS, KIJL, NINF, NSUP, & ! ---------------------------- IF (IREFRA == 1 .OR. IREFRA == 3) THEN + !$acc loop DO IJ = KIJS,KIJL THDD(IJ,K) = SD*DDPHI(IJ) - CD*DDLAM(IJ)*DCO(IJ) ENDDO ELSE + !$acc loop DO IJ = KIJS,KIJL THDD(IJ,K) = 0.0_JWRB ENDDO @@ -166,6 +173,7 @@ SUBROUTINE PROPDOT(KIJS, KIJL, NINF, NSUP, & SS = SD**2 SC = SD*CD CC = CD**2 + !$acc loop DO IJ = KIJS,KIJL SDOT(IJ,K,NFRE_RED) = -SC*DUPHI(IJ) - CC*DVPHI(IJ) & & - (SS*DULAM(IJ) + SC*DVLAM(IJ))*DCO(IJ) @@ -176,6 +184,7 @@ SUBROUTINE PROPDOT(KIJS, KIJL, NINF, NSUP, & !* 2.5.3 LOOP OVER FREQUENCIES. ! ---------------------- + !$acc loop independent collapse(2) DO M=1,NFRE_RED DO IJ=KIJS,KIJL SDOT(IJ,K,M) = (SDOT(IJ,K,NFRE_RED)*CGROUP_EXT(IJ,M) & @@ -187,6 +196,9 @@ SUBROUTINE PROPDOT(KIJS, KIJL, NINF, NSUP, & !* BRANCH BACK TO 2.5 FOR NEXT DIRECTION. ENDDO + !$acc end parallel loop + + !$acc end data IF (LHOOK) CALL DR_HOOK('PROPDOT',1,ZHOOK_HANDLE) diff --git a/src/ecwam/yowubuf.F90 b/src/ecwam/yowubuf.F90 index cab00362a..96f617e86 100644 --- a/src/ecwam/yowubuf.F90 +++ b/src/ecwam/yowubuf.F90 @@ -187,4 +187,11 @@ MODULE YOWUBUF !$acc declare create(SUMWN) !$acc declare create(LLWKPMN) !$acc declare create(KPM) +!$acc declare create(MPM) +!$acc declare create(WMPMN) +!$acc declare create(LLWLATN) +!$acc declare create(LLWLONN) +!$acc declare create(LLWCORN) +!$acc declare create(LLWKPMN) +!$acc declare create(LLWMPMN) END MODULE YOWUBUF From 922573f0eed186a9efe6703789f3eb359f106231 Mon Sep 17 00:00:00 2001 From: Ahmad Nawab Date: Mon, 18 Nov 2024 19:59:34 +0000 Subject: [PATCH 2/9] Don't allocate flags used to mask PROPAGS2 on GPU --- src/ecwam/ctuwupdt.F90 | 19 ++++--------------- src/ecwam/yowubuf.F90 | 6 ------ 2 files changed, 4 insertions(+), 21 deletions(-) diff --git a/src/ecwam/ctuwupdt.F90 b/src/ecwam/ctuwupdt.F90 index da41a2df0..dcd69c4ae 100644 --- a/src/ecwam/ctuwupdt.F90 +++ b/src/ecwam/ctuwupdt.F90 @@ -175,11 +175,13 @@ SUBROUTINE CTUWUPDT (IJS, IJL, NINF, NSUP, & IF (IREFRA == 2 .OR. IREFRA == 3) THEN IF (.NOT. ALLOCATED(WMPMN)) ALLOCATE(WMPMN(IJS:IJL,NANG,NFRE_RED,-1:1)) +#ifndef _OPENACC IF (.NOT. ALLOCATED(LLWLATN)) ALLOCATE(LLWLATN(NANG,NFRE_RED,2,2)) IF (.NOT. ALLOCATED(LLWLONN)) ALLOCATE(LLWLONN(NANG,NFRE_RED,2)) IF (.NOT. ALLOCATED(LLWCORN)) ALLOCATE(LLWCORN(NANG,NFRE_RED,4,2)) IF (.NOT. ALLOCATED(LLWKPMN)) ALLOCATE(LLWKPMN(NANG,NFRE_RED,-1:1)) IF (.NOT. ALLOCATED(LLWMPMN)) ALLOCATE(LLWMPMN(NANG,NFRE_RED,-1:1)) +#endif ENDIF @@ -255,15 +257,14 @@ SUBROUTINE CTUWUPDT (IJS, IJL, NINF, NSUP, & ! FIND THE LOGICAL FLAGS THAT WILL LIMIT THE EXTEND OF THE CALCULATION IN PROPAGS2 ! IN CASE REFRACTION IS USED +#ifndef _OPENACC IF (IREFRA == 2 .OR. IREFRA == 3) THEN - !$acc parallel loop independent collapse(4) present(WLATN,LLWLATN) DO ICL=1,2 DO IC=1,2 DO K=1,NANG DO M=1,NFRE_RED LLWLATN(K,M,IC,ICL)=.FALSE. - !$acc loop DO IJ=IJS,IJL IF (WLATN(IJ,K,M,IC,ICL) > 0.0_JWRB) THEN LLWLATN(K,M,IC,ICL)=.TRUE. @@ -274,14 +275,11 @@ SUBROUTINE CTUWUPDT (IJS, IJL, NINF, NSUP, & ENDDO ENDDO ENDDO - !$acc end parallel - !$acc parallel loop independent collapse(3) present(WLONN,LLWLONN) DO IC=1,2 DO M=1,NFRE_RED DO K=1,NANG LLWLONN(K,M,IC)=.FALSE. - !$acc loop DO IJ=IJS,IJL IF (WLONN(IJ,K,M,IC) > 0.0_JWRB) THEN LLWLONN(K,M,IC)=.TRUE. @@ -291,15 +289,12 @@ SUBROUTINE CTUWUPDT (IJS, IJL, NINF, NSUP, & ENDDO ENDDO ENDDO - !$acc end parallel - !$acc parallel loop independent collapse(4) present(WCORN,LLWCORN) DO ICL=1,2 DO ICR=1,4 DO M=1,NFRE_RED DO K=1,NANG LLWCORN(K,M,ICR,ICL)=.FALSE. - !$acc loop DO IJ=IJS,IJL IF (WCORN(IJ,K,M,ICR,ICL) > 0.0_JWRB) THEN LLWCORN(K,M,ICR,ICL)=.TRUE. @@ -310,14 +305,11 @@ SUBROUTINE CTUWUPDT (IJS, IJL, NINF, NSUP, & ENDDO ENDDO ENDDO - !$acc end parallel - !$acc parallel loop independent collapse(3) present(WKPMN,LLWKPMN) DO IC=-1,1 DO M=1,NFRE_RED DO K=1,NANG LLWKPMN(K,M,IC)=.FALSE. - !$acc loop DO IJ=IJS,IJL IF (WKPMN(IJ,K,M,IC) > 0.0_JWRB) THEN LLWKPMN(K,M,IC)=.TRUE. @@ -327,14 +319,11 @@ SUBROUTINE CTUWUPDT (IJS, IJL, NINF, NSUP, & ENDDO ENDDO ENDDO - !$acc end parallel - !$acc parallel loop independent collapse(3) present(WMPMN,LLWMPMN) DO IC=-1,1 DO M=1,NFRE_RED DO K=1,NANG LLWMPMN(K,M,IC)=.FALSE. - !$acc loop DO IJ=IJS,IJL IF (WMPMN(IJ,K,M,IC) > 0.0_JWRB) THEN LLWMPMN(K,M,IC)=.TRUE. @@ -344,9 +333,9 @@ SUBROUTINE CTUWUPDT (IJS, IJL, NINF, NSUP, & ENDDO ENDDO ENDDO - !$acc end parallel ENDIF +#endif IF (ALLOCATED(THDD)) DEALLOCATE(THDD) IF (ALLOCATED(THDC)) DEALLOCATE(THDC) diff --git a/src/ecwam/yowubuf.F90 b/src/ecwam/yowubuf.F90 index 96f617e86..20387d8f8 100644 --- a/src/ecwam/yowubuf.F90 +++ b/src/ecwam/yowubuf.F90 @@ -185,13 +185,7 @@ MODULE YOWUBUF !$acc declare create(WLONN) !$acc declare create(WKPMN) !$acc declare create(SUMWN) -!$acc declare create(LLWKPMN) !$acc declare create(KPM) !$acc declare create(MPM) !$acc declare create(WMPMN) -!$acc declare create(LLWLATN) -!$acc declare create(LLWLONN) -!$acc declare create(LLWCORN) -!$acc declare create(LLWKPMN) -!$acc declare create(LLWMPMN) END MODULE YOWUBUF From 0b268ac5561188816a6f2e8212511d38d5f8cbef Mon Sep 17 00:00:00 2001 From: Ahmad Nawab Date: Mon, 18 Nov 2024 20:07:25 +0000 Subject: [PATCH 3/9] PROPAGS2: use Loki to preprocess depth and current refraction code path --- src/ecwam/CMakeLists.txt | 13 +++++++ src/ecwam/ecwam_propags2_loki.config | 57 ++++++++++++++++++++++++++++ src/ecwam/propags2.F90 | 18 +++++++-- 3 files changed, 85 insertions(+), 3 deletions(-) create mode 100644 src/ecwam/ecwam_propags2_loki.config diff --git a/src/ecwam/CMakeLists.txt b/src/ecwam/CMakeLists.txt index 5a5f34d31..9c67734ed 100644 --- a/src/ecwam/CMakeLists.txt +++ b/src/ecwam/CMakeLists.txt @@ -519,6 +519,19 @@ if( HAVE_LOKI ) target_compile_definitions( ${ecwam} PRIVATE WAM_GPU ) endif() + if( NOT LOKI_MODE MATCHES "idem|idem-stack") + # Preprocess propags2.F90 for GPU enabled runs using Loki + loki_transform_target( TARGET ${ecwam} + MODE "idem" + DIRECTIVE openacc + FRONTEND ${LOKI_FRONTEND} + CONFIG ${CMAKE_CURRENT_SOURCE_DIR}/ecwam_propags2_loki.config + PLAN ${CMAKE_CURRENT_BINARY_DIR}/loki_propags2_plan_ecwam.cmake + SOURCES + ${CMAKE_CURRENT_SOURCE_DIR}/ + ) + endif() + # Apply Loki source file transformation to lib target loki_transform_target( TARGET ${ecwam} MODE ${LOKI_MODE} diff --git a/src/ecwam/ecwam_propags2_loki.config b/src/ecwam/ecwam_propags2_loki.config new file mode 100644 index 000000000..c96de842f --- /dev/null +++ b/src/ecwam/ecwam_propags2_loki.config @@ -0,0 +1,57 @@ +[default] +mode = "idem" +role = "kernel" +expand = true +strict = true +replicate = true + +# Utility calls and IO statements to remove +[transformations.RemoveCodeTransformation] + module = "loki.transformations" +[transformations.RemoveCodeTransformation.options] + remove_dead_code = true + kernel_only = true + +# Loop transformations +[transformations.LoopUnrollTransformation] + module = "loki.transformations" + classname = "TransformLoopsTransformation" +[transformations.LoopUnrollTransformation.options] + loop_unroll = true + +[transformations.LoopFuseTransformation] + module = "loki.transformations" + classname = "TransformLoopsTransformation" +[transformations.LoopFuseTransformation.options] + loop_fusion = true + +# SubstituteExpressionTransformation +[transformations.SubstituteExpressionTransformation] + module = "loki.transformations" +[transformations.SubstituteExpressionTransformation.options] + substitute_expressions = true + substitute_body = true +[transformations.SubstituteExpressionTransformation.options.expression_map] + "llwlonn(k,m,ic)" = ".true." + "llwlatn(k,m,ic,icl)" = ".true." + "llwcorn(k,m,icr,icl)" = ".true." + "llwkpmn(k,m,ic)" = ".true." + "llwmpmn(k,m,ic)" = ".true." + +# Idem transformation +[transformations.IdemTransformation] + module = "loki.transformations" + +# loki pipelines +[pipelines.idem] + transformations = [ + 'SubstituteExpressionTransformation', 'RemoveCodeTransformation', 'LoopUnrollTransformation', + 'LoopFuseTransformation', 'IdemTransformation' +] + +# Define entry point for call-tree transformation +[routines.propags2] + role = "kernel" + expand = false + replicate = false + \ No newline at end of file diff --git a/src/ecwam/propags2.F90 b/src/ecwam/propags2.F90 index 133569576..2a7ca9c05 100644 --- a/src/ecwam/propags2.F90 +++ b/src/ecwam/propags2.F90 @@ -123,34 +123,43 @@ SUBROUTINE PROPAGS2 (F1, F3, NINF, NSUP, KIJS, KIJL, NANG, ND3SF1, ND3EF1, ND3S, !* DEPTH AND CURRENT REFRACTION. ! ----------------------------- - !$acc kernels present(F1,F3,SUMWN,WLONN,KLON,WLATN,KLAT,WCORN,KCOR, & - !$acc & LLWKPMN,WKPMN,KPM,LLWMPMN,WMPMN,MPM) + !$acc parallel loop independent collapse(3) & + !$acc & present(F1,F3,SUMWN,WLONN,KLON,WLATN,KLAT,WCORN,KCOR, & + !$acc & WKPMN,KPM,WMPMN,MPM) DO M = ND3S, ND3E DO K = 1, NANG + !$loki loop-fusion DO IJ = KIJS, KIJL F3(IJ,K,M) = (1.0_JWRB-SUMWN(IJ,K,M))* F1(IJ,K,M) ENDDO + !$loki loop-unroll DO IC=1,2 IF (LLWLONN(K,M,IC)) THEN + !$loki loop-fusion DO IJ = KIJS, KIJL F3(IJ,K,M) = F3(IJ,K,M) + WLONN(IJ,K,M,IC)*F1(KLON(IJ,IC),K,M) ENDDO ENDIF ENDDO + !$loki loop-unroll DO ICL=1,2 + !$loki loop-unroll DO IC=1,2 IF (LLWLATN(K,M,IC,ICL)) THEN + !$loki loop-fusion DO IJ = KIJS, KIJL F3(IJ,K,M) = F3(IJ,K,M) + WLATN(IJ,K,M,IC,ICL)*F1(KLAT(IJ,IC,ICL),K,M) ENDDO ENDIF ENDDO + !$loki loop-unroll DO ICR=1,4 IF (LLWCORN(K,M,ICR,ICL)) THEN + !$loki loop-fusion DO IJ = KIJS, KIJL F3(IJ,K,M) = F3(IJ,K,M) + WCORN(IJ,K,M,ICR,ICL)*F1(KCOR(IJ,KCR(K,ICR),ICL),K,M) ENDDO @@ -158,15 +167,18 @@ SUBROUTINE PROPAGS2 (F1, F3, NINF, NSUP, KIJS, KIJL, NANG, ND3SF1, ND3EF1, ND3S, ENDDO ENDDO + !$loki loop-unroll DO IC=-1,1,2 IF (LLWKPMN(K,M,IC)) THEN + !$loki loop-fusion DO IJ = KIJS, KIJL F3(IJ,K,M) = F3(IJ,K,M) + WKPMN(IJ,K,M,IC)* F1(IJ,KPM(K,IC),M) ENDDO ENDIF IF (LLWMPMN(K,M,IC)) THEN + !$loki loop-fusion DO IJ = KIJS, KIJL F3(IJ,K,M) = F3(IJ,K,M) + WMPMN(IJ,K,M,IC)* F1(IJ,K,MPM(M,IC)) ENDDO @@ -176,7 +188,7 @@ SUBROUTINE PROPAGS2 (F1, F3, NINF, NSUP, KIJS, KIJL, NANG, ND3SF1, ND3EF1, ND3S, ENDDO ENDDO - !$acc end kernels + !$acc end parallel loop ENDIF From 4b966cf80eae1e08d8dcfdf53005442cff89ae56 Mon Sep 17 00:00:00 2001 From: Ahmad Nawab Date: Mon, 18 Nov 2024 20:13:11 +0000 Subject: [PATCH 4/9] PROPAGS2: switch without refraction option to use acc parallel loop --- src/ecwam/propags2.F90 | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/ecwam/propags2.F90 b/src/ecwam/propags2.F90 index 2a7ca9c05..d3af6ef3f 100644 --- a/src/ecwam/propags2.F90 +++ b/src/ecwam/propags2.F90 @@ -96,7 +96,8 @@ SUBROUTINE PROPAGS2 (F1, F3, NINF, NSUP, KIJS, KIJL, NANG, ND3SF1, ND3EF1, ND3S, !* WITHOUT DEPTH OR/AND CURRENT REFRACTION. ! ---------------------------------------- - !$acc kernels present(F1,F3,KLON,KLAT,KCOR,SUMWN,WLONN,WLATN,WCORN,JXO,JYO,KCR,WKPMN,KPM) + !$acc parallel loop independent collapse(3) & + !$acc & present(F1,F3,KLON,KLAT,KCOR,SUMWN,WLONN,WLATN,WCORN,JXO,JYO,KCR,WKPMN,KPM) DO K = 1, NANG DO M = ND3S, ND3E @@ -117,7 +118,7 @@ SUBROUTINE PROPAGS2 (F1, F3, NINF, NSUP, KIJS, KIJL, NANG, ND3SF1, ND3EF1, ND3S, ENDDO ENDDO - !$acc end kernels + !$acc end parallel loop ELSE !* DEPTH AND CURRENT REFRACTION. From 2f389db450521948d9cd404e954ced1961c8411b Mon Sep 17 00:00:00 2001 From: Ahmad Nawab Date: Fri, 15 Nov 2024 19:59:43 +0000 Subject: [PATCH 5/9] RUN_MODEL: make irefra configurable --- share/ecwam/scripts/ecwam_run_model.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/share/ecwam/scripts/ecwam_run_model.sh b/share/ecwam/scripts/ecwam_run_model.sh index 51cf1ded9..6542ffd1f 100755 --- a/share/ecwam/scripts/ecwam_run_model.sh +++ b/share/ecwam/scripts/ecwam_run_model.sh @@ -85,6 +85,7 @@ nproma=$(read_config nproma --default=24) iphys=$(read_config iphys --default=1) llgcbz0=$(read_config llgcbz0 --default=F) llnormagam=$(read_config llnormagam --default=F) +irefra=$(read_config irefra --default=0) # read timesteps phys_tstp=$(read_config physics.timestep --format=seconds --default=900) @@ -228,7 +229,7 @@ cat > wam_namelist << EOF LLNORMAGAM = ${llnormagam}, IPROPAGS = 2, LSUBGRID = F, - IREFRA = 0, + IREFRA = ${irefra}, LICERUN = ${licerun}, LMASKICE = T, LWAMRSETCI = T, From 69ac178b68e64e0f236df0831d4709067e6dc551 Mon Sep 17 00:00:00 2001 From: Ahmad Nawab Date: Fri, 15 Nov 2024 20:00:17 +0000 Subject: [PATCH 6/9] RUN_MODEL: set CDATECURA to beginning of forcings time --- share/ecwam/scripts/ecwam_run_model.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/share/ecwam/scripts/ecwam_run_model.sh b/share/ecwam/scripts/ecwam_run_model.sh index 6542ffd1f..b6126b020 100755 --- a/share/ecwam/scripts/ecwam_run_model.sh +++ b/share/ecwam/scripts/ecwam_run_model.sh @@ -212,6 +212,7 @@ cat > wam_namelist << EOF CBPLTDT = "${begofrn}", CEPLTDT = "${endofrn}", CDATEF = "${begoffo}", + CDATECURA = "${begoffo}", DELPRO_LF = ${adv_fast_tstp}, IFRELFMAX = ${ifrelfmax}, IDELPRO = ${adv_base_tstp}, From 239efbbee210b27c7b0d2cf979c336180875aa51 Mon Sep 17 00:00:00 2001 From: Ahmad Nawab Date: Fri, 15 Nov 2024 21:03:56 +0000 Subject: [PATCH 7/9] 49r2: add O320 test with current refraction --- tests/etopo1_oper_an_fc_O320_cy49r2.yml | 119 ++++++++++++++++++++++++ 1 file changed, 119 insertions(+) create mode 100644 tests/etopo1_oper_an_fc_O320_cy49r2.yml diff --git a/tests/etopo1_oper_an_fc_O320_cy49r2.yml b/tests/etopo1_oper_an_fc_O320_cy49r2.yml new file mode 100644 index 000000000..5da377a17 --- /dev/null +++ b/tests/etopo1_oper_an_fc_O320_cy49r2.yml @@ -0,0 +1,119 @@ +grid: O320 +directions: 24 +frequencies: 29 +bathymetry: ETOPO1 + +advection: + timestep: 450 +physics: + timestep: 900 + +analysis.begin: 2022-12-31 12:00:00 +analysis.end: 2023-01-01 00:00:00 +forecast.begin: 2023-01-01 00:00:00 +forecast.end: 2023-01-01 06:00:00 + +begin: ${analysis.begin} +end: ${forecast.end} + +nproma: 64 +irefra: 2 + +forcings: + file: data/forcings/oper_an_12h_fc_2023010100_36h_O320.grib + + at: + - begin: ${analysis.begin} + end: ${analysis.end} + timestep: 06:00 + - begin: ${forecast.begin} + end: ${forecast.end} + timestep: 01:00 + +output: + fields: + name: + - swh # Significant height of combined wind waves and swell + - mwd # Mean wave direction + - mwp # Mean wave period + - pp1d # Peak wave period + - dwi # 10 metre wind direction + - cdww # Coefficient of drag with waves + - wind # 10 metre wind speed + format: grib # (default : grib) or binary + at: + - timestep: 01:00 + + restart: + format: binary # (default : binary) or grib + at: + - time: ${end} + + +validation: + + double_precision: + + # initial analysis time + - name: swh + time: 2022-12-31 12:00:00 + average: 0.1334550051820419E+01 + relative_tolerance: 1.e-14 + hashes: ['0x3FF55A5127B719D3'] + + # initial forecast time + - name: swh + time: 2023-01-01 00:00:00 + average: 0.1522843329260829E+01 + relative_tolerance: 1.e-14 + hashes: ['0x3FF85D90F781B65D'] + + # 6h into forecast + - name: swh + time: 2023-01-01 06:00:00 + average: 0.1602511331925879E+01 + relative_tolerance: 1.e-14 + hashes: ['0x3FF9A3E2EC2174F4'] + - name: swh + time: 2023-01-01 06:00:00 + minimum: 0.1739656079628811E-01 + relative_tolerance: 1.e-14 + hashes: ['0x3F91D0676EBB52D5'] + - name: swh + time: 2023-01-01 06:00:00 + maximum: 0.7461369035298830E+01 + relative_tolerance: 1.e-14 + hashes: ['0x401DD8711FD7FB70'] + + single_precision: + + # initial analysis time + - name: swh + time: 2022-12-31 12:00:00 + average: 0.1334386110305786E+01 + relative_tolerance: 1.e-6 + hashes: ['0x3FF559A540000000'] + + # initial forecast time + - name: swh + time: 2023-01-01 00:00:00 + average: 0.1522840261459351E+01 + relative_tolerance: 1.e-6 + hashes: ['0x3FF85D8DC0000000'] + + # 6h into forecast + - name: swh + time: 2023-01-01 06:00:00 + average: 0.1602509140968323E+01 + relative_tolerance: 1.e-6 + hashes: ['0x3FF9A3E0A0000000'] + - name: swh + time: 2023-01-01 06:00:00 + minimum: 0.1733318902552128E-01 + relative_tolerance: 1.e-6 + hashes: ['0x3F91BFCAA0000000'] + - name: swh + time: 2023-01-01 06:00:00 + maximum: 0.7461367130279541E+01 + relative_tolerance: 1.e-6 + hashes: ['0x401DD870A0000000'] From 02c49f532fe47eb2af44a6c60204299d1d3391cb Mon Sep 17 00:00:00 2001 From: Ahmad Nawab Date: Thu, 28 Nov 2024 13:18:25 +0000 Subject: [PATCH 8/9] Update README for GPU enabled versions --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 7c18ce2b7..76849d36c 100644 --- a/README.md +++ b/README.md @@ -227,7 +227,7 @@ translation toolchain Loki. Currently, three Loki transformations are supported: The scc-hoist and scc-stack transformations offer superior performance to the scc transformation. Currently, only the OpenACC programming model on Nvidia GPUs is supported. -NB: GPU offload is not yet supported for ecWAM 1.4.x. +NB: GPU offload is not supported for ecWAM 1.4.0. Building -------- From 3dd78a4817012212d25d5455f4319e907596c891 Mon Sep 17 00:00:00 2001 From: Ahmad Nawab Date: Wed, 4 Dec 2024 09:13:24 +0000 Subject: [PATCH 9/9] Change CDATECURA to begofrn and make IDELCUR configurable --- share/ecwam/scripts/ecwam_run_model.sh | 4 +++- tests/etopo1_oper_an_fc_O320_cy49r2.yml | 3 +++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/share/ecwam/scripts/ecwam_run_model.sh b/share/ecwam/scripts/ecwam_run_model.sh index b6126b020..69d292cbd 100755 --- a/share/ecwam/scripts/ecwam_run_model.sh +++ b/share/ecwam/scripts/ecwam_run_model.sh @@ -92,6 +92,7 @@ phys_tstp=$(read_config physics.timestep --format=seconds --default=900) adv_base_tstp=$(read_config advection.timestep --format=seconds --default=900) adv_fast_tstp=$(read_config advection.fast_waves.timestep --format=seconds --default=$adv_base_tstp) ifrelfmax=$(read_config advection.fast_waves.max_frequency --default=0) +idelcur=$(read_config currents.input_step --default=86400) # verify timesteps if [ $(( $adv_base_tstp%$adv_fast_tstp )) -ne 0 ] ; then @@ -212,12 +213,13 @@ cat > wam_namelist << EOF CBPLTDT = "${begofrn}", CEPLTDT = "${endofrn}", CDATEF = "${begoffo}", - CDATECURA = "${begoffo}", + CDATECURA = "${begofrn}", DELPRO_LF = ${adv_fast_tstp}, IFRELFMAX = ${ifrelfmax}, IDELPRO = ${adv_base_tstp}, IDELT = ${phys_tstp}, IDELINT = ${ppfreq}, + IDELCUR = ${idelcur} IREST = 1, LFDBIOOUT = F, LFDB = F, diff --git a/tests/etopo1_oper_an_fc_O320_cy49r2.yml b/tests/etopo1_oper_an_fc_O320_cy49r2.yml index 5da377a17..de155577a 100644 --- a/tests/etopo1_oper_an_fc_O320_cy49r2.yml +++ b/tests/etopo1_oper_an_fc_O320_cy49r2.yml @@ -30,6 +30,9 @@ forcings: end: ${forecast.end} timestep: 01:00 +currents: + input_step: 86400 + output: fields: name: