diff --git a/src/trans/common/internal/tpm_distr.F90 b/src/trans/common/internal/tpm_distr.F90 index 6ae338445..eddb16843 100755 --- a/src/trans/common/internal/tpm_distr.F90 +++ b/src/trans/common/internal/tpm_distr.F90 @@ -171,7 +171,8 @@ MODULE TPM_DISTR REAL(KIND=JPRD) ,ALLOCATABLE :: RWEIGHT(:) ! Weight per grid-point (if weighted distribution) INTEGER(KIND=JPIM) ,ALLOCATABLE :: NPROCA_GP(:) ! Number of grid-points per a-set -INTEGER(KIND=JPIB), ALLOCATABLE :: OFFSETS_GEMM1(:), OFFSETS_GEMM2(:) +INTEGER(KIND=JPIB), ALLOCATABLE :: OFFSETS_GEMM1(:), OFFSETS_GEMM2(:), OFFSETS_GEMM_MATRIX(:) +INTEGER(KIND=JPIM), ALLOCATABLE :: LEGENDRE_MATRIX_STRIDES(:) END TYPE DISTR_TYPE diff --git a/src/trans/gpu/algor/hicblas_cutlass.cuda.h b/src/trans/gpu/algor/hicblas_cutlass.cuda.h index 9a42bb2f6..8b0eacf88 100644 --- a/src/trans/gpu/algor/hicblas_cutlass.cuda.h +++ b/src/trans/gpu/algor/hicblas_cutlass.cuda.h @@ -154,11 +154,11 @@ class cutlass_sgemm_grouped { } // namespace detail template -void cutlass_sgemm_wrapper_grouped_op(int resol_id, int blas_id, int m, int *n, int *k, +void cutlass_sgemm_wrapper_grouped_op(int resol_id, int blas_id, int m, const int *n, const int *k, float alpha, const float *A, int lda, - int64_t *offsetsA, const float *B, int ldb, - int64_t *offsetsB, float beta, float *C, - int ldc, int64_t *offsetsC, int batchCount, + const int64_t *offsetsA, const float *B, const int *ldb, + const int64_t *offsetsB, float beta, float *C, + int ldc, const int64_t *offsetsC, int batchCount, cudaStream_t stream, void *growing_allocator) { using namespace detail; @@ -182,10 +182,10 @@ void cutlass_sgemm_wrapper_grouped_op(int resol_id, int blas_id, int m, int *n, } void cutlass_sgemm_wrapper_grouped(int resol_id, int blas_id, char transa, char transb, - int m, int *n, int *k, float alpha, - const float *A, int lda, int64_t *offsetsA, - const float *B, int ldb, int64_t *offsetsB, float beta, - float *C, int ldc, int64_t *offsetsC, + int m, const int *n, const int *k, float alpha, + const float *A, int lda, const int64_t *offsetsA, + const float *B, const int *ldb, const int64_t *offsetsB, float beta, + float *C, int ldc, const int64_t *offsetsC, int batchCount, cudaStream_t stream, void *growing_allocator) { diff --git a/src/trans/gpu/algor/hicblas_gemm.hip.cpp b/src/trans/gpu/algor/hicblas_gemm.hip.cpp index 252dc7dd4..f9caa0383 100644 --- a/src/trans/gpu/algor/hicblas_gemm.hip.cpp +++ b/src/trans/gpu/algor/hicblas_gemm.hip.cpp @@ -61,7 +61,7 @@ template auto &get_graph_cache() { template auto &get_ptr_cache() { using real_t = typename Gemm::real_type; static std::unordered_map< - cache_key, std::tuple> + cache_key, std::tuple> ptrCache; return ptrCache; } @@ -89,10 +89,11 @@ template void erase_from_caches(int resol_id) { // this version is using graphs and caches the graphs template -void run_group_graph(Gemm &&gemm, int resol_id, int m, int *n, int *k, - Real alpha, const Real *A, int lda, int64_t *offsetsA, - const Real *B, int ldb, int64_t *offsetsB, Real beta, - Real *C, int ldc, int64_t *offsetsC, int batchCount, +void run_group_graph(Gemm &&gemm, int resol_id, int m, const int *n, + const int *k, Real alpha, const Real *A, int lda, + const int64_t *offsetsA, const Real *B, const int *ldb, + const int64_t *offsetsB, Real beta, Real *C, int ldc, + const int64_t *offsetsC, int batchCount, hipStream_t stream, int blas_id, void *growing_allocator) { growing_allocator_register_free_c(growing_allocator, free_gemm_graph_cache); @@ -138,7 +139,7 @@ void run_group_graph(Gemm &&gemm, int resol_id, int m, int *n, int *k, HIC_CHECK(hipStreamBeginCapture(stream, hipStreamCaptureModeGlobal)); gemm(stream, m, n[i], k[i], alpha, A + offsetsA[i], lda, B + offsetsB[i], - ldb, beta, C + offsetsC[i], ldc); + ldb[i], beta, C + offsetsC[i], ldc); hipGraph_t my_graph; HIC_CHECK(hipStreamEndCapture(stream, &my_graph)); hipGraphNode_t my_node; @@ -163,16 +164,16 @@ void run_group_graph(Gemm &&gemm, int resol_id, int m, int *n, int *k, // stupid simple gemm calls template -void run_group(Gemm &&gemm, int resol_id, int m, int *n, int *k, Real alpha, - const Real *A, int lda, int64_t *offsetsA, const Real *B, - int ldb, int64_t *offsetsB, Real beta, Real *C, int ldc, - int64_t *offsetsC, int batchCount, hipStream_t stream, - int = -1) { +void run_group(Gemm &&gemm, int resol_id, int m, const int *n, const int *k, + Real alpha, const Real *A, int lda, const int64_t *offsetsA, + const Real *B, const int *ldb, const int64_t *offsetsB, + Real beta, Real *C, int ldc, const int64_t *offsetsC, + int batchCount, hipStream_t stream, int = -1) { for (int i = 0; i < batchCount; ++i) { if (m == 0 || n[i] == 0 || k[i] == 0) continue; gemm(stream, m, n[i], k[i], alpha, A + offsetsA[i], lda, B + offsetsB[i], - ldb, beta, C + offsetsC[i], ldc); + ldb[i], beta, C + offsetsC[i], ldc); } } @@ -215,14 +216,12 @@ template struct hipblas_gemm_grouped { #ifndef USE_CUTLASS -void hipblas_sgemm_wrapper_grouped(int resol_id, int blas_id, char transa, - char transb, int m, int *n, int *k, - float alpha, const float *A, int lda, - int64_t *offsetsA, const float *B, int ldb, - int64_t *offsetsB, float beta, float *C, - int ldc, int64_t *offsetsC, int batchCount, - hipStream_t stream, - void *growing_allocator) { +void hipblas_sgemm_wrapper_grouped( + int resol_id, int blas_id, char transa, char transb, int m, const int *n, + const int *k, float alpha, const float *A, int lda, const int64_t *offsetsA, + const float *B, const int *ldb, const int64_t *offsetsB, float beta, + float *C, int ldc, const int64_t *offsetsC, int batchCount, + hipStream_t stream, void *growing_allocator) { hipblasOperation_t op_t1 = HIPBLAS_OP_N, op_t2 = HIPBLAS_OP_N; if (transa == 'T' || transa == 't') @@ -244,12 +243,13 @@ void hipblas_sgemm_wrapper_grouped(int resol_id, int blas_id, char transa, #endif void hipblas_dgemm_wrapper_grouped(int resol_id, int blas_id, char transa, - char transb, int m, int *n, int *k, - double alpha, const double *A, int lda, - int64_t *offsetsA, const double *B, int ldb, - int64_t *offsetsB, double beta, double *C, - int ldc, int64_t *offsetsC, int batchCount, - hipStream_t stream, void *) { + char transb, int m, const int *n, + const int *k, double alpha, const double *A, + int lda, const int64_t *offsetsA, + const double *B, const int *ldb, + const int64_t *offsetsB, double beta, + double *C, int ldc, const int64_t *offsetsC, + int batchCount, hipStream_t stream, void *) { hipblasOperation_t op_t1 = HIPBLAS_OP_N, op_t2 = HIPBLAS_OP_N; if (transa == 'T' || transa == 't') @@ -313,13 +313,12 @@ void hipblas_sgemm_wrapper(char transa, char transb, int m, int n, int k, batchCount)); } -void hipblas_sgemm_wrapper_grouped(int resol_id, int blas_id, char transa, - char transb, int m, int *n, int *k, - float alpha, const float *A, int lda, - int64_t *offsetsA, const float *B, int ldb, - int64_t *offsetsB, float beta, float *C, - int ldc, int64_t *offsetsC, int batchCount, - size_t stream, void *growing_allocator) { +void hipblas_sgemm_wrapper_grouped( + int resol_id, int blas_id, char transa, char transb, int m, const int *n, + const int *k, float alpha, const float *A, int lda, const int64_t *offsetsA, + const float *B, const int *ldb, const int64_t *offsetsB, float beta, + float *C, int ldc, const int64_t *offsetsC, int batchCount, size_t stream, + void *growing_allocator) { #ifdef USE_CUTLASS cutlass_sgemm_wrapper_grouped(resol_id, blas_id, transa, transb, m, n, k, alpha, A, lda, offsetsA, B, ldb, offsetsB, beta, @@ -334,12 +333,14 @@ void hipblas_sgemm_wrapper_grouped(int resol_id, int blas_id, char transa, } void hipblas_dgemm_wrapper_grouped(int resol_id, int blas_id, char transa, - char transb, int m, int *n, int *k, - double alpha, const double *A, int lda, - int64_t *offsetsA, const double *B, int ldb, - int64_t *offsetsB, double beta, double *C, - int ldc, int64_t *offsetsC, int batchCount, - size_t stream, void *growing_allocator) { + char transb, int m, const int *n, + const int *k, double alpha, const double *A, + int lda, const int64_t *offsetsA, + const double *B, const int *ldb, + const int64_t *offsetsB, double beta, + double *C, int ldc, const int64_t *offsetsC, + int batchCount, size_t stream, + void *growing_allocator) { hipblas_dgemm_wrapper_grouped(resol_id, blas_id, transa, transb, m, n, k, alpha, A, lda, offsetsA, B, ldb, offsetsB, beta, C, ldc, offsetsC, batchCount, diff --git a/src/trans/gpu/algor/hicblas_mod.F90 b/src/trans/gpu/algor/hicblas_mod.F90 index 6ffc90812..21adae46d 100644 --- a/src/trans/gpu/algor/hicblas_mod.F90 +++ b/src/trans/gpu/algor/hicblas_mod.F90 @@ -87,8 +87,8 @@ SUBROUTINE HIP_DGEMM_GROUPED( & &) BIND(C, NAME='hipblas_dgemm_wrapper_grouped') USE ISO_C_BINDING, ONLY: C_CHAR, C_INT, C_DOUBLE, C_SIZE_T, C_PTR, C_INT64_T CHARACTER(1,C_CHAR), VALUE :: CTA, CTB - INTEGER(C_INT), VALUE :: RESOL_ID, BLAS_ID, M, LDA, LDB, LDC, BATCHCOUNT - INTEGER(C_INT) :: N(*), K(*) + INTEGER(C_INT), VALUE :: RESOL_ID, BLAS_ID, M, LDA, LDC, BATCHCOUNT + INTEGER(C_INT) :: N(*), K(*), LDB(*) INTEGER(C_INT64_T) :: OFFSETA(*), OFFSETB(*), OFFSETC(*) REAL(C_DOUBLE), VALUE :: ALPHA,BETA REAL(C_DOUBLE) :: A(*), B(*), C(*) @@ -108,8 +108,8 @@ SUBROUTINE HIP_SGEMM_GROUPED( & &) BIND(C, NAME='hipblas_sgemm_wrapper_grouped') USE ISO_C_BINDING, ONLY: C_CHAR, C_INT, C_FLOAT, C_SIZE_T, C_PTR, C_INT64_T CHARACTER(1,C_CHAR), VALUE :: CTA, CTB - INTEGER(C_INT), VALUE :: RESOL_ID, BLAS_ID, M, LDA, LDB, LDC, BATCHCOUNT - INTEGER(C_INT) :: N(*), K(*) + INTEGER(C_INT), VALUE :: RESOL_ID, BLAS_ID, M, LDA, LDC, BATCHCOUNT + INTEGER(C_INT) :: N(*), K(*), LDB(*) INTEGER(C_INT64_T) :: OFFSETA(*), OFFSETB(*), OFFSETC(*) REAL(C_FLOAT), VALUE :: ALPHA,BETA REAL(C_FLOAT) :: A(*), B(*), C(*) @@ -247,7 +247,7 @@ SUBROUTINE HIP_DGEMM_GROUPED_OVERLOAD( & INTEGER(KIND=JPIM) :: LDA INTEGER(KIND=JPIB) :: OFFSETA(:) REAL(KIND=JPRD), DIMENSION(*) :: BARRAY - INTEGER(KIND=JPIM) :: LDB + INTEGER(KIND=JPIM) :: LDB(:) INTEGER(KIND=JPIB) :: OFFSETB(:) REAL(KIND=JPRD) :: BETA REAL(KIND=JPRD), DIMENSION(:) :: CARRAY @@ -297,8 +297,8 @@ SUBROUTINE HIP_SGEMM_GROUPED_OVERLOAD(& REAL(KIND=JPRM), DIMENSION(:) :: AARRAY INTEGER(KIND=JPIM) :: LDA INTEGER(KIND=JPIB) :: OFFSETA(:) - REAL(KIND=JPRM), DIMENSION(:,:,:) :: BARRAY - INTEGER(KIND=JPIM) :: LDB + REAL(KIND=JPRM), DIMENSION(*) :: BARRAY + INTEGER(KIND=JPIM) :: LDB(:) INTEGER(KIND=JPIB) :: OFFSETB(:) REAL(KIND=JPRM) :: BETA REAL(KIND=JPRM), DIMENSION(:) :: CARRAY diff --git a/src/trans/gpu/external/setup_trans.F90 b/src/trans/gpu/external/setup_trans.F90 index b1a44fec3..d6ef1cbf5 100755 --- a/src/trans/gpu/external/setup_trans.F90 +++ b/src/trans/gpu/external/setup_trans.F90 @@ -172,7 +172,7 @@ SUBROUTINE SETUP_TRANS(KSMAX,KDGL,KDLON,KLOEN,LDSPLIT,PSTRET,& #endif INTEGER :: INUMDEVS, IDEV, MYGPU -#include "user_clock.intfb.h" +REAL(KIND=JPRBT), POINTER :: LOCAL_ARR(:,:) ! ------------------------------------------------------------------ IF (LHOOK) CALL DR_HOOK('SETUP_TRANS',0,ZHOOK_HANDLE) @@ -484,11 +484,11 @@ SUBROUTINE SETUP_TRANS(KSMAX,KDGL,KDLON,KLOEN,LDSPLIT,PSTRET,& ! Initialize A arrays - ALLOCATE(FG%ZAA(ALIGN(R%NDGNH,8),ALIGN((R%NTMAX+2)/2,8),D%NUMP)) - ALLOCATE(FG%ZAS(ALIGN(R%NDGNH,8),ALIGN((R%NTMAX+3)/2,8),D%NUMP)) + ALLOCATE(FG%ZAA(D%OFFSETS_GEMM_MATRIX(D%NUMP+1))) + ALLOCATE(FG%ZAS(D%OFFSETS_GEMM_MATRIX(D%NUMP+1))) - FG%ZAA(:,:,:) = 0._JPRBT - FG%ZAS(:,:,:) = 0._JPRBT + FG%ZAA(:) = 0._JPRBT + FG%ZAS(:) = 0._JPRBT DO JMLOC=1,D%NUMP KM = D%MYMS(JMLOC) @@ -496,23 +496,28 @@ SUBROUTINE SETUP_TRANS(KSMAX,KDGL,KDLON,KLOEN,LDSPLIT,PSTRET,& ILA = (R%NSMAX-KM+2)/2 ILS = (R%NSMAX-KM+3)/2 - FG%ZAA(1:KDGLU,1:ILA,JMLOC)=S%FA(JMLOC)%RPNMA(1:KDGLU,1:ILA) - FG%ZAS(1:KDGLU,1:ILS,JMLOC)=S%FA(JMLOC)%RPNMS(1:KDGLU,1:ILS) + IF (KM /= 0) THEN + CALL C_F_POINTER(C_LOC(FG%ZAA(1+D%OFFSETS_GEMM_MATRIX(JMLOC))), LOCAL_ARR, & + & (/D%LEGENDRE_MATRIX_STRIDES(JMLOC),ILA/)) + LOCAL_ARR(1:KDGLU,1:ILA) = S%FA(JMLOC)%RPNMA(1:KDGLU,1:ILA) + + CALL C_F_POINTER(C_LOC(FG%ZAS(1+D%OFFSETS_GEMM_MATRIX(JMLOC))), LOCAL_ARR, & + & (/D%LEGENDRE_MATRIX_STRIDES(JMLOC),ILS/)) + LOCAL_ARR(1:KDGLU,1:ILS) = S%FA(JMLOC)%RPNMS(1:KDGLU,1:ILS) + ELSE + ALLOCATE(FG%ZAA0(ALIGN(KDGLU,8),ILA)) + ALLOCATE(FG%ZAS0(ALIGN(KDGLU,8),ILS)) + + FG%ZAA0(:,:) = 0 + FG%ZAS0(:,:) = 0 + FG%ZAA0(1:KDGLU,1:ILA)=S%FA(JMLOC)%RPNMA(1:KDGLU,1:ILA) + FG%ZAS0(1:KDGLU,1:ILS)=S%FA(JMLOC)%RPNMS(1:KDGLU,1:ILS) + ENDIF ENDDO - - ! arrays for m=0 in ledir_mod: - IMLOC0 = FINDLOC(D%MYMS,0) - IF(IMLOC0(1) > 0) THEN - ALLOCATE(FG%ZAA0(SIZE(FG%ZAA,1),SIZE(FG%ZAA,2))) - ALLOCATE(FG%ZAS0(SIZE(FG%ZAS,1),SIZE(FG%ZAS,2))) - FG%ZAA0 = FG%ZAA(:,:,IMLOC0(1)) - FG%ZAS0 = FG%ZAS(:,:,IMLOC0(1)) - ENDIF - + ALLOCATE(FG%ZEPSNM(D%NUMP,0:R%NTMAX+2)) FG%ZEPSNM = 0._JPRBT - CALL PREPSNM !Initialize on the host - + CALL PREPSNM WRITE(NOUT,*)'setup_trans: sizes1 NUMP=',D%NUMP #ifdef ACCGPU WRITE(NOUT,*) 'Using OpenACC' @@ -520,13 +525,13 @@ SUBROUTINE SETUP_TRANS(KSMAX,KDGL,KDLON,KLOEN,LDSPLIT,PSTRET,& #ifdef OMPGPU WRITE(NOUT,*) 'Using OpenMP offloading' #endif - WRITE(NOUT,'(A10,":",I9,"B")') 'FG%ZAS', C_SIZEOF(FG%ZAS(1,1,1))*SIZE(FG%ZAS) - WRITE(NOUT,'(A10,":",I9,"B")') 'FG%ZAA', C_SIZEOF(FG%ZAA(1,1,1))*SIZE(FG%ZAA) - WRITE(NOUT,'(A10,":",I9,"B")') 'FG%ZAS0', C_SIZEOF(FG%ZAS0(1,1))*SIZE(FG%ZAS0) - WRITE(NOUT,'(A10,":",I9,"B")') 'FG%ZAA0', C_SIZEOF(FG%ZAA0(1,1))*SIZE(FG%ZAA0) - WRITE(NOUT,'(A10,":",I9,"B")') 'FG%ZEPSNM', C_SIZEOF(FG%ZEPSNM(1,1))*SIZE(FG%ZEPSNM) + WRITE(NOUT,'(A10,":",I11,"B")') 'FG%ZAS', C_SIZEOF(FG%ZAS(1))*SIZE(FG%ZAS) + WRITE(NOUT,'(A10,":",I11,"B")') 'FG%ZAA', C_SIZEOF(FG%ZAA(1))*SIZE(FG%ZAA) + WRITE(NOUT,'(A10,":",I11,"B")') 'FG%ZAS0', C_SIZEOF(FG%ZAS0(1,1))*SIZE(FG%ZAS0) + WRITE(NOUT,'(A10,":",I11,"B")') 'FG%ZAA0', C_SIZEOF(FG%ZAA0(1,1))*SIZE(FG%ZAA0) + WRITE(NOUT,'(A10,":",I11,"B")') 'FG%ZEPSNM', C_SIZEOF(FG%ZEPSNM(1,1))*SIZE(FG%ZEPSNM) - IF (IMLOC0(1) > 0) THEN + IF (ANY(D%MYMS == 0)) THEN #ifdef ACCGPU !$ACC ENTER DATA COPYIN(FG%ZAA0,FG%ZAS0) ASYNC(1) #endif diff --git a/src/trans/gpu/internal/ledir_mod.F90 b/src/trans/gpu/internal/ledir_mod.F90 index c834cb993..e7a1fb18a 100755 --- a/src/trans/gpu/internal/ledir_mod.F90 +++ b/src/trans/gpu/internal/ledir_mod.F90 @@ -211,7 +211,7 @@ SUBROUTINE LEDIR(ALLOCATOR,ZINPS,ZINPA,ZINPS0,ZINPA0,ZOUT,ZOUT0,POA1,KF_FS) NS(KMLOC) = (R_NSMAX-KM+2)/2 KS(KMLOC) = G_NDGLU(KM) AOFFSETS(KMLOC) = IIN_STRIDES0*D_OFFSETS_GEMM1(KMLOC) - BOFFSETS(KMLOC) = SIZE(ZAA,1)*SIZE(ZAA,2)*(KMLOC-1) + BOFFSETS(KMLOC) = D%OFFSETS_GEMM_MATRIX(KMLOC) COFFSETS(KMLOC) = IOUT_STRIDES0*D_OFFSETS_GEMM2(KMLOC) ENDDO IF(IMLOC0(1) > 0) THEN @@ -230,7 +230,7 @@ SUBROUTINE LEDIR(ALLOCATOR,ZINPS,ZINPA,ZINPS0,ZINPA0,ZOUT,ZOUT0,POA1,KF_FS) & 2*KF_FS, NS(:), KS(:), & & 1.0_JPRBT, & & ZINPA, IIN_STRIDES0, AOFFSETS, & - & ZAA, SIZE(ZAA,1), BOFFSETS, & + & ZAA, D%LEGENDRE_MATRIX_STRIDES, BOFFSETS, & & 0.0_JPRBT, & & ZOUT, IOUT_STRIDES0, COFFSETS, & & D_NUMP, STREAM=1_C_INT, ALLOC=ALLOCATOR%PTR) @@ -331,7 +331,7 @@ SUBROUTINE LEDIR(ALLOCATOR,ZINPS,ZINPA,ZINPS0,ZINPA0,ZOUT,ZOUT0,POA1,KF_FS) NS(KMLOC) = (R_NSMAX-KM+3)/2 KS(KMLOC) = G_NDGLU(KM) AOFFSETS(KMLOC) = IIN_STRIDES0*D_OFFSETS_GEMM1(KMLOC) - BOFFSETS(KMLOC) = SIZE(ZAS,1)*SIZE(ZAS,2)*(KMLOC-1) + BOFFSETS(KMLOC) = D%OFFSETS_GEMM_MATRIX(KMLOC) COFFSETS(KMLOC) = IOUT_STRIDES0*D_OFFSETS_GEMM2(KMLOC) ENDDO IF(IMLOC0(1) > 0) THEN @@ -350,7 +350,7 @@ SUBROUTINE LEDIR(ALLOCATOR,ZINPS,ZINPA,ZINPS0,ZINPA0,ZOUT,ZOUT0,POA1,KF_FS) & 2*KF_FS, NS(:), KS(:), & & 1.0_JPRBT, & & ZINPS, IIN_STRIDES0, AOFFSETS, & - & ZAS, SIZE(ZAS,1), BOFFSETS, & + & ZAS, D%LEGENDRE_MATRIX_STRIDES, BOFFSETS, & & 0.0_JPRBT, & & ZOUT, IOUT_STRIDES0, COFFSETS, & & D_NUMP, STREAM=1_C_INT, ALLOC=ALLOCATOR%PTR) diff --git a/src/trans/gpu/internal/leinv_mod.F90 b/src/trans/gpu/internal/leinv_mod.F90 index f2738f44b..8bfc2ac0e 100755 --- a/src/trans/gpu/internal/leinv_mod.F90 +++ b/src/trans/gpu/internal/leinv_mod.F90 @@ -271,7 +271,7 @@ SUBROUTINE LEINV(ALLOCATOR,PIA,ZINP,ZINP0,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,KF_LEG) KS(KMLOC) = (R_NSMAX-KM+2)/2 NS(KMLOC) = G_NDGLU(KM) AOFFSETS(KMLOC) = IIN_STRIDES0*D_OFFSETS_GEMM2(KMLOC) - BOFFSETS(KMLOC) = SIZE(ZAA,1)*SIZE(ZAA,2)*(KMLOC-1) + BOFFSETS(KMLOC) = D%OFFSETS_GEMM_MATRIX(KMLOC) COFFSETS(KMLOC) = IOUT_STRIDES0*D_OFFSETS_GEMM1(KMLOC) ENDDO IF(IMLOC0(1) > 0) THEN @@ -290,7 +290,7 @@ SUBROUTINE LEINV(ALLOCATOR,PIA,ZINP,ZINP0,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,KF_LEG) & 2*KF_LEG, NS(:), KS(:), & & 1.0_JPRBT, & & ZINP, IIN_STRIDES0, AOFFSETS, & - & ZAA, SIZE(ZAA,1), BOFFSETS, & + & ZAA, D%LEGENDRE_MATRIX_STRIDES, BOFFSETS, & & 0.0_JPRBT, & & ZOUTA, IOUT_STRIDES0, COFFSETS, & & D_NUMP, STREAM=1_C_INT, ALLOC=ALLOCATOR%PTR) @@ -411,7 +411,7 @@ SUBROUTINE LEINV(ALLOCATOR,PIA,ZINP,ZINP0,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,KF_LEG) KS(KMLOC) = (R_NSMAX-KM+3)/2 NS(KMLOC) = G_NDGLU(KM) AOFFSETS(KMLOC) = IIN_STRIDES0*D_OFFSETS_GEMM2(KMLOC) - BOFFSETS(KMLOC) = SIZE(ZAS,1)*SIZE(ZAS,2)*(KMLOC-1) + BOFFSETS(KMLOC) = D%OFFSETS_GEMM_MATRIX(KMLOC) COFFSETS(KMLOC) = IOUT_STRIDES0*D_OFFSETS_GEMM1(KMLOC) ENDDO IF(IMLOC0(1) > 0) THEN @@ -430,7 +430,7 @@ SUBROUTINE LEINV(ALLOCATOR,PIA,ZINP,ZINP0,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,KF_LEG) & 2*KF_LEG, NS(:), KS(:), & & 1.0_JPRBT, & & ZINP, IIN_STRIDES0, AOFFSETS, & - & ZAS, SIZE(ZAS,1), BOFFSETS, & + & ZAS, D%LEGENDRE_MATRIX_STRIDES, BOFFSETS, & & 0.0_JPRBT, & & ZOUTS, IOUT_STRIDES0, COFFSETS, & & D_NUMP, STREAM=1_C_INT, ALLOC=ALLOCATOR%PTR) diff --git a/src/trans/gpu/internal/sump_trans_mod.F90 b/src/trans/gpu/internal/sump_trans_mod.F90 index 5a1de3028..3c3b94d69 100755 --- a/src/trans/gpu/internal/sump_trans_mod.F90 +++ b/src/trans/gpu/internal/sump_trans_mod.F90 @@ -41,7 +41,7 @@ SUBROUTINE SUMP_TRANS INTEGER(KIND=JPIM) :: JM INTEGER(KIND=JPIM) :: JGL,IGL,IPLAT,ISENDSET,IRECVSET,JML,IPOS,IM INTEGER(KIND=JPIM) :: IGPTOT,IMEDIAP,IRESTM,JA,JB,IOFF,KMLOC,KM -INTEGER(KIND=JPIB) :: OFFSET1,OFFSET2 +INTEGER(KIND=JPIB) :: OFFSET1,OFFSET2,OFFSET3 INTEGER(KIND=JPIM),ALLOCATABLE :: IGPTOTL(:,:) REAL(KIND=JPRD),ALLOCATABLE :: ZDUM(:) @@ -272,23 +272,33 @@ SUBROUTINE SUMP_TRANS ALLOCATE(D%OFFSETS_GEMM1(D%NUMP+1)) ALLOCATE(D%OFFSETS_GEMM2(D%NUMP+1)) +ALLOCATE(D%OFFSETS_GEMM_MATRIX(D%NUMP+1)) +ALLOCATE(D%LEGENDRE_MATRIX_STRIDES(D%NUMP)) OFFSET1 = 0 OFFSET2 = 0 +OFFSET3 = 0 DO KMLOC=1,D%NUMP KM = D%MYMS(KMLOC) D%OFFSETS_GEMM1(KMLOC) = OFFSET1 D%OFFSETS_GEMM2(KMLOC) = OFFSET2 + D%OFFSETS_GEMM_MATRIX(KMLOC) = OFFSET3 !KM=0 is transformed in double precision, no need to store here IF (KM /= 0) THEN OFFSET1 = OFFSET1 + ALIGN(G%NDGLU(KM),8) ! N_OFFSET takes the max of the two GEMMs OFFSET2 = OFFSET2 + ALIGN((R%NSMAX-KM+3)/2,8) + + D%LEGENDRE_MATRIX_STRIDES(KMLOC) = ALIGN(G%NDGLU(KM),8) + ! Note that both sizes have to be aligned because we make the GEMMs + ! multiples of 8 + OFFSET3 = OFFSET3 + ALIGN((R%NSMAX-KM+3)/2,8) * D%LEGENDRE_MATRIX_STRIDES(KMLOC) ENDIF ENDDO D%OFFSETS_GEMM1(D%NUMP+1) = OFFSET1 D%OFFSETS_GEMM2(D%NUMP+1) = OFFSET2 +D%OFFSETS_GEMM_MATRIX(D%NUMP+1) = OFFSET3 ! ------------------------------------------------------------------ 9 FORMAT(1X,'ARRAY ',A10,' ALLOCATED ',8I8) diff --git a/src/trans/gpu/internal/tpm_fields_gpu.F90 b/src/trans/gpu/internal/tpm_fields_gpu.F90 index 09baef270..7f0694b57 100644 --- a/src/trans/gpu/internal/tpm_fields_gpu.F90 +++ b/src/trans/gpu/internal/tpm_fields_gpu.F90 @@ -19,8 +19,8 @@ MODULE TPM_FIELDS_GPU TYPE FIELDS_GPU_TYPE ! scratch arrays for ltinv and ltdir and associated dimension variables -REAL(KIND=JPRBT),ALLOCATABLE :: ZAA(:,:,:) !! JPRL for 1/2 -REAL(KIND=JPRBT),ALLOCATABLE :: ZAS(:,:,:) !! JPRL for 1/2 +REAL(KIND=JPRBT),ALLOCATABLE :: ZAA(:) !! JPRL for 1/2 +REAL(KIND=JPRBT),ALLOCATABLE :: ZAS(:) !! JPRL for 1/2 ! for m=0 in ledir_mod: REAL(KIND=JPRD),ALLOCATABLE :: ZAA0(:,:)