From 8622da102316fb9bab2134cea9e554a164c90a13 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Mon, 24 Jun 2024 03:15:45 -0700 Subject: [PATCH 1/3] fix overflows --- src/trans/common/internal/tpm_distr.F90 | 10 +-- .../gpu/algor/buffered_allocator_mod.F90 | 32 ++++++--- src/trans/gpu/algor/ext_acc.F90 | 20 +++--- src/trans/gpu/algor/hicblas_cutlass.cuda.h | 15 +++-- src/trans/gpu/algor/hicblas_gemm.hip.cpp | 35 +++++----- src/trans/gpu/algor/hicblas_mod.F90 | 24 +++---- src/trans/gpu/algor/hicfft.hip.cpp | 18 ++--- src/trans/gpu/external/setup_trans.F90 | 6 +- src/trans/gpu/internal/fsc_mod.F90 | 18 ++--- src/trans/gpu/internal/ftdir_mod.F90 | 6 +- src/trans/gpu/internal/ftinv_mod.F90 | 6 +- src/trans/gpu/internal/ledir_mod.F90 | 17 +++-- src/trans/gpu/internal/leinv_mod.F90 | 17 +++-- src/trans/gpu/internal/ltdir_mod.F90 | 30 +++++---- src/trans/gpu/internal/ltinv_mod.F90 | 52 ++++++++------- src/trans/gpu/internal/sump_trans_mod.F90 | 5 +- src/trans/gpu/internal/tpm_hicfft.F90 | 42 +++++++----- src/trans/gpu/internal/trgtol_mod.F90 | 66 ++++++++++++------- src/trans/gpu/internal/trltog_mod.F90 | 62 ++++++++++------- src/trans/gpu/internal/trltom_mod.F90 | 22 ++++--- src/trans/gpu/internal/trltom_pack_unpack.F90 | 53 ++++++++------- src/trans/gpu/internal/trmtol_mod.F90 | 22 ++++--- src/trans/gpu/internal/trmtol_pack_unpack.F90 | 37 ++++++----- 23 files changed, 349 insertions(+), 266 deletions(-) diff --git a/src/trans/common/internal/tpm_distr.F90 b/src/trans/common/internal/tpm_distr.F90 index 6a151192f..598bae4ec 100755 --- a/src/trans/common/internal/tpm_distr.F90 +++ b/src/trans/common/internal/tpm_distr.F90 @@ -12,7 +12,7 @@ MODULE TPM_DISTR ! Module for distributed memory environment. -USE EC_PARKIND ,ONLY : JPIM ,JPRD +USE EC_PARKIND ,ONLY : JPIM ,JPRD, JPIB IMPLICIT NONE @@ -97,7 +97,7 @@ MODULE TPM_DISTR INTEGER(KIND=JPIM) :: NDGL_FS ! Number of rows of latitudes for which this process is ! performing Fourier Space calculations -INTEGER(KIND=JPIM) ,ALLOCATABLE :: NSTAGTF(:) ! Offset for specific latitude in +INTEGER(KIND=JPIB) ,ALLOCATABLE :: NSTAGTF(:) ! Offset for specific latitude in ! Fourier/gridpoint buffer INTEGER(KIND=JPIM) :: NLENGTF ! Second dimension of Fourier/gridpoint buffer ! (sum of (NLOEN+3) over local latitudes) @@ -171,7 +171,7 @@ MODULE TPM_DISTR REAL(KIND=JPRD) ,ALLOCATABLE :: RWEIGHT(:) ! Weight per grid-point (if weighted distribution) INTEGER(KIND=JPIM) ,ALLOCATABLE :: NPROCA_GP(:) ! Number of grid-points per a-set -INTEGER(KIND=JPIM), ALLOCATABLE :: OFFSETS_GEMM1(:), OFFSETS_GEMM2(:) +INTEGER(KIND=JPIB), ALLOCATABLE :: OFFSETS_GEMM1(:), OFFSETS_GEMM2(:) END TYPE DISTR_TYPE @@ -188,7 +188,7 @@ MODULE TPM_DISTR INTEGER(KIND=JPIM) ,ALLOCATABLE :: D_NPROCL(:) ! Process responsible for each lat. (F.S) INTEGER(KIND=JPIM) ,ALLOCATABLE :: D_NPNTGTB1(:,:) INTEGER(KIND=JPIM) ,ALLOCATABLE :: D_NASM0(:) ! Address in a spectral array of (m, n=m) -INTEGER(KIND=JPIM) ,ALLOCATABLE :: D_NSTAGTF(:) ! Offset for specific latitude in +INTEGER(KIND=JPIB) ,ALLOCATABLE :: D_NSTAGTF(:) ! Offset for specific latitude in INTEGER(KIND=JPIM) :: D_NDGL_FS ! Number of rows of latitudes for which this process is ! performing Fourier Space calculations INTEGER(KIND=JPIM) ,ALLOCATABLE :: D_MSTABF(:) @@ -200,7 +200,7 @@ MODULE TPM_DISTR ! The offsets in the input and output arrays to the gemms. ! (1) are the offsets in the "inputs" of dirtrans ("outputs" invtrans) ! (2) are the offsets in the "outputs" of invtrans ("inputs" dirtrans) -INTEGER(KIND=JPIM), POINTER :: D_OFFSETS_GEMM1(:), D_OFFSETS_GEMM2(:) +INTEGER(KIND=JPIB), POINTER :: D_OFFSETS_GEMM1(:), D_OFFSETS_GEMM2(:) END MODULE TPM_DISTR diff --git a/src/trans/gpu/algor/buffered_allocator_mod.F90 b/src/trans/gpu/algor/buffered_allocator_mod.F90 index 34b9c42aa..ba613fe7a 100644 --- a/src/trans/gpu/algor/buffered_allocator_mod.F90 +++ b/src/trans/gpu/algor/buffered_allocator_mod.F90 @@ -69,10 +69,11 @@ FUNCTION MAKE_BUFFERED_ALLOCATOR() MAKE_BUFFERED_ALLOCATOR%NEXT_BUF = 0 END FUNCTION MAKE_BUFFERED_ALLOCATOR - FUNCTION RESERVE(ALLOCATOR, SZ) + FUNCTION RESERVE(ALLOCATOR, SZ, WHO) IMPLICIT NONE TYPE(BUFFERED_ALLOCATOR), INTENT(INOUT) :: ALLOCATOR INTEGER(KIND=C_SIZE_T), INTENT(IN) :: SZ + CHARACTER(*), INTENT(IN), OPTIONAL :: WHO TYPE(ALLOCATION_RESERVATION_HANDLE) :: RESERVE @@ -88,7 +89,7 @@ SUBROUTINE INSTANTIATE_ALLOCATOR(ALLOCATOR, GROWING_ALLOCATION) IMPLICIT NONE TYPE(BUFFERED_ALLOCATOR), INTENT(INOUT) :: ALLOCATOR !!TYPE(GROWING_ALLOCATION_TYPE), INTENT(IN), POINTER :: GROWING_ALLOCATION - TYPE(GROWING_ALLOCATION_TYPE), target, INTENT(INout) :: GROWING_ALLOCATION + TYPE(GROWING_ALLOCATION_TYPE), TARGET, INTENT(INOUT) :: GROWING_ALLOCATION INTEGER :: I DO I = 0, NBUF-1 @@ -126,10 +127,13 @@ SUBROUTINE ASSIGN_PTR_FLOAT(DST, SRC, START_IN_BYTES, LENGTH_IN_BYTES, SET_VALUE INTEGER(KIND=4), INTENT(IN), OPTIONAL :: SET_STREAM LOGICAL :: SET_VALUE_EFF INTEGER(KIND=4) :: SET_STREAM_EFF - INTEGER(KIND=C_SIZE_T) :: START_IN_BYTES, LENGTH_IN_BYTES + INTEGER(KIND=C_SIZE_T) :: START_IN_BYTES, LENGTH_IN_BYTES, END_IN_BYTES, J IF (START_IN_BYTES + LENGTH_IN_BYTES - 1 > SIZE(SRC, KIND=C_SIZE_T)) THEN CALL ABORT_TRANS("Logical Error in ASSIGN_PTR - OOB assignment") ENDIF + IF (START_IN_BYTES < 1) THEN + CALL ABORT_TRANS("Logical Error in ASSIGN_PTR - OOB assignment") + ENDIF IF (PRESENT(SET_VALUE)) THEN SET_VALUE_EFF = SET_VALUE ELSE @@ -143,9 +147,11 @@ SUBROUTINE ASSIGN_PTR_FLOAT(DST, SRC, START_IN_BYTES, LENGTH_IN_BYTES, SET_VALUE IF (SET_VALUE_EFF .AND. LENGTH_IN_BYTES > 0) THEN ! This option is turned off by default, but for experimentation we can turn it on. This is ! setting all bits to 1 (meaning NaN in floating point) - !$ACC KERNELS PRESENT(SRC) ASYNC(SET_STREAM_EFF) - SRC(START_IN_BYTES:START_IN_BYTES+LENGTH_IN_BYTES-1) = -1 - !$ACC END KERNELS!! LOOP + !$ACC PARALLEL PRESENT(SRC) ASYNC(SET_STREAM_EFF) + DO J=1_C_SIZE_T,LENGTH_IN_BYTES + SRC(J) = -1 + ENDDO + !$ACC END PARALLEL ENDIF CALL C_F_POINTER(C_LOC(SRC(START_IN_BYTES:START_IN_BYTES+LENGTH_IN_BYTES-1)), DST, & & [C_SIZEOF(SRC(START_IN_BYTES:START_IN_BYTES+LENGTH_IN_BYTES-1))/C_SIZEOF(DST(0))]) @@ -159,10 +165,13 @@ SUBROUTINE ASSIGN_PTR_DOUBLE(DST, SRC, START_IN_BYTES, LENGTH_IN_BYTES, SET_VALU INTEGER(KIND=4), INTENT(IN), OPTIONAL :: SET_STREAM LOGICAL :: SET_VALUE_EFF INTEGER(KIND=4) :: SET_STREAM_EFF - INTEGER(KIND=C_SIZE_T) :: START_IN_BYTES, LENGTH_IN_BYTES + INTEGER(KIND=C_SIZE_T) :: START_IN_BYTES, LENGTH_IN_BYTES, END_IN_BYTES, J IF (START_IN_BYTES + LENGTH_IN_BYTES - 1 > SIZE(SRC, KIND=C_SIZE_T)) THEN CALL ABORT_TRANS("Logical Error in ASSIGN_PTR - OOB assignment") ENDIF + IF (START_IN_BYTES < 1) THEN + CALL ABORT_TRANS("Logical Error in ASSIGN_PTR - OOB assignment") + ENDIF IF (PRESENT(SET_VALUE)) THEN SET_VALUE_EFF = SET_VALUE ELSE @@ -176,9 +185,12 @@ SUBROUTINE ASSIGN_PTR_DOUBLE(DST, SRC, START_IN_BYTES, LENGTH_IN_BYTES, SET_VALU IF (SET_VALUE_EFF .AND. LENGTH_IN_BYTES > 0) THEN ! This option is turned off by default, but for experimentation we can turn it on. This is ! setting all bits to 1 (meaning NaN in floating point) - !$ACC KERNELS PRESENT(SRC) ASYNC(SET_STREAM_EFF) - SRC(START_IN_BYTES:START_IN_BYTES+LENGTH_IN_BYTES-1) = -1 - !$ACC END KERNELS!! LOOP + END_IN_BYTES=START_IN_BYTES+LENGTH_IN_BYTES-1 + !$ACC PARALLEL PRESENT(SRC) ASYNC(SET_STREAM_EFF) + DO J=1_C_SIZE_T,LENGTH_IN_BYTES + SRC(J) = -1 + ENDDO + !$ACC END PARALLEL ENDIF CALL C_F_POINTER(C_LOC(SRC(START_IN_BYTES:START_IN_BYTES+LENGTH_IN_BYTES-1)), DST, & & [C_SIZEOF(SRC(START_IN_BYTES:START_IN_BYTES+LENGTH_IN_BYTES-1))/C_SIZEOF(DST(0))]) diff --git a/src/trans/gpu/algor/ext_acc.F90 b/src/trans/gpu/algor/ext_acc.F90 index 021f36e2c..e550bdad7 100644 --- a/src/trans/gpu/algor/ext_acc.F90 +++ b/src/trans/gpu/algor/ext_acc.F90 @@ -18,7 +18,7 @@ module openacc_ext_type end module module openacc_ext use iso_c_binding, only: c_ptr, c_size_t, c_loc, c_sizeof - use openacc, only: acc_create, acc_copyin, acc_handle_kind + use openacc, only: acc_handle_kind use openacc_ext_type, only: ext_acc_arr_desc implicit none @@ -247,7 +247,7 @@ function get_common_pointers(in_ptrs, out_ptrs) result(num_ranges) enddo end function subroutine ext_acc_create(ptrs, stream) - use openacc, only: acc_create, acc_async_sync + use openacc, only: acc_async_sync use iso_fortran_env, only: int32 implicit none type(ext_acc_arr_desc), intent(in) :: ptrs(:) @@ -269,8 +269,7 @@ subroutine ext_acc_create(ptrs, stream) do i = 1, num_ranges call c_f_pointer(common_ptrs(i)%ptr, pp, shape=[common_ptrs(i)%sz/c_sizeof(pp(1))]) - !!call acc_create_async(pp, common_ptrs(i)%sz, async=stream_act) - call acc_create(pp, int(common_ptrs(i)%sz)) + !$acc enter data create(pp) async(stream_act) enddo end subroutine subroutine ext_acc_copyin(ptrs, stream) @@ -296,12 +295,11 @@ subroutine ext_acc_copyin(ptrs, stream) do i = 1, num_ranges call c_f_pointer(common_ptrs(i)%ptr, pp, shape=[common_ptrs(i)%sz/c_sizeof(pp(1))]) - !!call acc_copyin_async(pp, common_ptrs(i)%sz, async=stream_act) - call acc_copyin(pp, int(common_ptrs(i)%sz)) + !$acc enter data copyin(pp) async(stream_act) enddo end subroutine subroutine ext_acc_copyout(ptrs, stream) - use openacc, only: acc_async_sync, acc_copyout + use openacc, only: acc_async_sync implicit none type(ext_acc_arr_desc), intent(in) :: ptrs(:) integer(acc_handle_kind), optional :: stream @@ -323,12 +321,11 @@ subroutine ext_acc_copyout(ptrs, stream) do i = 1, num_ranges call c_f_pointer(common_ptrs(i)%ptr, pp, shape=[common_ptrs(i)%sz/c_sizeof(pp(1))]) - !!call acc_copyout_async(pp, common_ptrs(i)%sz, async=stream_act) - call acc_copyout(pp, int(common_ptrs(i)%sz)) + !$acc exit data copyout(pp) async(stream_act) enddo end subroutine subroutine ext_acc_delete(ptrs, stream) - use openacc, only: acc_async_sync, acc_delete + use openacc, only: acc_async_sync implicit none type(ext_acc_arr_desc), intent(in) :: ptrs(:) integer(acc_handle_kind), optional :: stream @@ -349,9 +346,8 @@ subroutine ext_acc_delete(ptrs, stream) num_ranges = get_common_pointers(ptrs, common_ptrs) do i = 1, num_ranges - call c_f_pointer(common_ptrs(i)%ptr, pp, shape=[common_ptrs(i)%sz/c_sizeof(pp(1))]) - !!call acc_delete_async(pp, common_ptrs(i)%sz, async=stream_act) call acc_delete(pp, int(common_ptrs(i)%sz)) + !$acc exit data delete(pp) async(stream_act) enddo end subroutine end module diff --git a/src/trans/gpu/algor/hicblas_cutlass.cuda.h b/src/trans/gpu/algor/hicblas_cutlass.cuda.h index 7a842a808..028b85c41 100644 --- a/src/trans/gpu/algor/hicblas_cutlass.cuda.h +++ b/src/trans/gpu/algor/hicblas_cutlass.cuda.h @@ -1,3 +1,6 @@ +// (C) Copyright 2000- ECMWF. +// (C) Copyright 2024- NVIDIA. + #ifdef USE_CUTLASS //#include "hicblas.h" #include "cutlass/gemm/device/gemm.h" @@ -151,9 +154,9 @@ class cutlass_sgemm_grouped { template void cutlass_sgemm_wrapper_grouped_op(int blas_id, int m, int *n, int *k, float alpha, const float *A, int lda, - int *offsetsA, const float *B, int ldb, - int *offsetsB, float beta, float *C, - int ldc, int *offsetsC, int batchCount, + int64_t *offsetsA, const float *B, int ldb, + int64_t *offsetsB, float beta, float *C, + int ldc, int64_t *offsetsC, int batchCount, cudaStream_t stream, void *growing_allocator) { using namespace detail; @@ -178,9 +181,9 @@ void cutlass_sgemm_wrapper_grouped_op(int blas_id, int m, int *n, int *k, void cutlass_sgemm_wrapper_grouped(int blas_id, char transa, char transb, int m, int *n, int *k, float alpha, - const float *A, int lda, int *offsetsA, - const float *B, int ldb, int *offsetsB, float beta, - float *C, int ldc, int *offsetsC, + const float *A, int lda, int64_t *offsetsA, + const float *B, int ldb, int64_t *offsetsB, float beta, + float *C, int ldc, int64_t *offsetsC, int batchCount, cudaStream_t stream, void *growing_allocator) { diff --git a/src/trans/gpu/algor/hicblas_gemm.hip.cpp b/src/trans/gpu/algor/hicblas_gemm.hip.cpp index 9d6178bed..1c0aaf36a 100644 --- a/src/trans/gpu/algor/hicblas_gemm.hip.cpp +++ b/src/trans/gpu/algor/hicblas_gemm.hip.cpp @@ -1,4 +1,5 @@ // (C) Copyright 2000- ECMWF. +// (C) Copyright 2024- NVIDIA. // // This software is licensed under the terms of the Apache Licence Version 2.0 // which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. @@ -64,9 +65,9 @@ template void free_gemm_cache(float *, size_t) { // this version is using graphs and caches the graphs template void run_group_graph(Gemm &&gemm, int m, int *n, int *k, Real alpha, - const Real *A, int lda, int *offsetsA, const Real *B, - int ldb, int *offsetsB, Real beta, Real *C, int ldc, - int *offsetsC, int batchCount, hipStream_t stream, + const Real *A, int lda, int64_t *offsetsA, const Real *B, + int ldb, int64_t *offsetsB, Real beta, Real *C, int ldc, + int64_t *offsetsC, int batchCount, hipStream_t stream, int blas_id, void *growing_allocator) { growing_allocator_register_free_c(growing_allocator, free_gemm_cache); @@ -133,8 +134,8 @@ void run_group_graph(Gemm &&gemm, int m, int *n, int *k, Real alpha, // stupid simple gemm calls template void run_group(Gemm &&gemm, int m, int *n, int *k, Real alpha, const Real *A, - int lda, int *offsetsA, const Real *B, int ldb, int *offsetsB, - Real beta, Real *C, int ldc, int *offsetsC, int batchCount, + int lda, int64_t *offsetsA, const Real *B, int ldb, int64_t *offsetsB, + Real beta, Real *C, int ldc, int64_t *offsetsC, int batchCount, hipStream_t stream, int = -1) { for (int i = 0; i < batchCount; ++i) { if (m == 0 || n[i] == 0 || k[i] == 0) @@ -186,9 +187,9 @@ template struct hipblas_gemm_grouped { void hipblas_sgemm_wrapper_grouped(int blas_id, char transa, char transb, int m, int *n, int *k, float alpha, - const float *A, int lda, int *offsetsA, - const float *B, int ldb, int *offsetsB, float beta, - float *C, int ldc, int *offsetsC, + const float *A, int lda, int64_t *offsetsA, + const float *B, int ldb, int64_t *offsetsB, float beta, + float *C, int ldc, int64_t *offsetsC, int batchCount, hipStream_t stream, void *growing_allocator) { @@ -215,10 +216,10 @@ void hipblas_sgemm_wrapper_grouped(int blas_id, char transa, char transb, void hipblas_dgemm_wrapper_grouped(int blas_id, char transa, char transb, int m, int *n, int *k, double alpha, - const double *A, int lda, int *offsetsA, - const double *B, int ldb, int *offsetsB, + const double *A, int lda, int64_t *offsetsA, + const double *B, int ldb, int64_t *offsetsB, double beta, - double *C, int ldc, int *offsetsC, + double *C, int ldc, int64_t *offsetsC, int batchCount, hipStream_t stream, void *) { hipblasOperation_t op_t1=HIPBLAS_OP_N, op_t2=HIPBLAS_OP_N; @@ -292,9 +293,9 @@ void hipblas_sgemm_wrapper (char transa, char transb, void hipblas_sgemm_wrapper_grouped(int blas_id, char transa, char transb, int m, int *n, int *k, float alpha, - const float *A, int lda, int *offsetsA, - const float *B, int ldb, int *offsetsB, float beta, - float *C, int ldc, int *offsetsC, + const float *A, int lda, int64_t *offsetsA, + const float *B, int ldb, int64_t *offsetsB, float beta, + float *C, int ldc, int64_t *offsetsC, int batchCount, size_t stream, void *growing_allocator) { #ifdef USE_CUTLASS @@ -313,9 +314,9 @@ void hipblas_sgemm_wrapper_grouped(int blas_id, char transa, char transb, void hipblas_dgemm_wrapper_grouped(int blas_id, char transa, char transb, int m, int *n, int *k, double alpha, - const double *A, int lda, int *offsetsA, - const double *B, int ldb, int *offsetsB, double beta, - double *C, int ldc, int *offsetsC, + const double *A, int lda, int64_t *offsetsA, + const double *B, int ldb, int64_t *offsetsB, double beta, + double *C, int ldc, int64_t *offsetsC, int batchCount, size_t stream, void *growing_allocator) { hipblas_dgemm_wrapper_grouped(blas_id, transa, transb, m, n, k, alpha, A, lda, offsetsA, B, diff --git a/src/trans/gpu/algor/hicblas_mod.F90 b/src/trans/gpu/algor/hicblas_mod.F90 index 988e1b3ef..186a87ef8 100644 --- a/src/trans/gpu/algor/hicblas_mod.F90 +++ b/src/trans/gpu/algor/hicblas_mod.F90 @@ -14,7 +14,7 @@ MODULE HICBLAS_MOD -USE EC_PARKIND, ONLY: JPIM, JPRM, JPRD +USE EC_PARKIND, ONLY: JPIM, JPRM, JPRD, JPIB USE GROWING_ALLOCATOR_MOD, ONLY: GROWING_ALLOCATION_TYPE USE OPENACC_LIB, ONLY: ACC_GET_HIP_STREAM @@ -75,10 +75,11 @@ SUBROUTINE HIP_DGEMM_GROUPED( & & C, LDC, OFFSETC, & & BATCHCOUNT, STREAM, ALLOC & &) BIND(C, NAME='hipblas_dgemm_wrapper_grouped') - USE ISO_C_BINDING, ONLY: C_CHAR, C_INT, C_DOUBLE, C_SIZE_T, C_PTR + USE ISO_C_BINDING, ONLY: C_CHAR, C_INT, C_DOUBLE, C_SIZE_T, C_PTR, C_INT64_T CHARACTER(1,C_CHAR), VALUE :: CTA, CTB INTEGER(C_INT), VALUE :: BLAS_ID, M, LDA, LDB, LDC, BATCHCOUNT - INTEGER(C_INT) :: N(*), K(*), OFFSETA(*), OFFSETB(*), OFFSETC(*) + INTEGER(C_INT) :: N(*), K(*) + INTEGER(C_INT64_T) :: OFFSETA(*), OFFSETB(*), OFFSETC(*) REAL(C_DOUBLE), VALUE :: ALPHA,BETA REAL(C_DOUBLE) :: A(*), B(*), C(*) INTEGER(KIND=C_SIZE_T) :: STREAM @@ -95,10 +96,11 @@ SUBROUTINE HIP_SGEMM_GROUPED( & & C, LDC, OFFSETC, & & BATCHCOUNT, STREAM, ALLOC & &) BIND(C, NAME='hipblas_sgemm_wrapper_grouped') - USE ISO_C_BINDING, ONLY: C_CHAR, C_INT, C_FLOAT, C_SIZE_T, C_PTR + USE ISO_C_BINDING, ONLY: C_CHAR, C_INT, C_FLOAT, C_SIZE_T, C_PTR, C_INT64_T CHARACTER(1,C_CHAR), VALUE :: CTA, CTB INTEGER(C_INT), VALUE :: BLAS_ID, M, LDA, LDB, LDC, BATCHCOUNT - INTEGER(C_INT) :: N(*), K(*), OFFSETA(*), OFFSETB(*), OFFSETC(*) + INTEGER(C_INT) :: N(*), K(*) + INTEGER(C_INT64_T) :: OFFSETA(*), OFFSETB(*), OFFSETC(*) REAL(C_FLOAT), VALUE :: ALPHA,BETA REAL(C_FLOAT) :: A(*), B(*), C(*) INTEGER(KIND=C_SIZE_T) :: STREAM @@ -220,14 +222,14 @@ SUBROUTINE HIP_DGEMM_GROUPED_OVERLOAD( & REAL(KIND=JPRD) :: ALPHA REAL(KIND=JPRD), DIMENSION(:) :: AARRAY INTEGER(KIND=JPIM) :: LDA - INTEGER(KIND=JPIM) :: OFFSETA(:) + INTEGER(KIND=JPIB) :: OFFSETA(:) REAL(KIND=JPRD), DIMENSION(*) :: BARRAY INTEGER(KIND=JPIM) :: LDB - INTEGER(KIND=JPIM) :: OFFSETB(:) + INTEGER(KIND=JPIB) :: OFFSETB(:) REAL(KIND=JPRD) :: BETA REAL(KIND=JPRD), DIMENSION(:) :: CARRAY INTEGER(KIND=JPIM) :: LDC - INTEGER(KIND=JPIM) :: OFFSETC(:) + INTEGER(KIND=JPIB) :: OFFSETC(:) INTEGER(KIND=JPIM) :: BATCHCOUNT INTEGER(KIND=C_INT) :: STREAM TYPE(GROWING_ALLOCATION_TYPE), INTENT(IN) :: ALLOC @@ -266,14 +268,14 @@ SUBROUTINE HIP_SGEMM_GROUPED_OVERLOAD(& REAL(KIND=JPRM) :: ALPHA REAL(KIND=JPRM), DIMENSION(:) :: AARRAY INTEGER(KIND=JPIM) :: LDA - INTEGER(KIND=JPIM) :: OFFSETA(:) + INTEGER(KIND=JPIB) :: OFFSETA(:) REAL(KIND=JPRM), DIMENSION(:,:,:) :: BARRAY INTEGER(KIND=JPIM) :: LDB - INTEGER(KIND=JPIM) :: OFFSETB(:) + INTEGER(KIND=JPIB) :: OFFSETB(:) REAL(KIND=JPRM) :: BETA REAL(KIND=JPRM), DIMENSION(:) :: CARRAY INTEGER(KIND=JPIM) :: LDC - INTEGER(KIND=JPIM) :: OFFSETC(:) + INTEGER(KIND=JPIB) :: OFFSETC(:) INTEGER(KIND=JPIM) :: BATCHCOUNT INTEGER(KIND=C_INT) :: STREAM TYPE(GROWING_ALLOCATION_TYPE), INTENT(IN) :: ALLOC diff --git a/src/trans/gpu/algor/hicfft.hip.cpp b/src/trans/gpu/algor/hicfft.hip.cpp index 8278d9a4b..a7198f415 100644 --- a/src/trans/gpu/algor/hicfft.hip.cpp +++ b/src/trans/gpu/algor/hicfft.hip.cpp @@ -54,12 +54,12 @@ template class hicfft_plan { void set_stream(hipStream_t stream) { fftSafeCall(hipfftSetStream(handle, stream)); } - hicfft_plan(hipfftHandle handle_, int offset_) + hicfft_plan(hipfftHandle handle_, int64_t offset_) : handle(handle_), offset(offset_) {} private: hipfftHandle handle; - int offset; + int64_t offset; }; // kfield -> handles @@ -89,7 +89,7 @@ void free_fft_cache(float *, size_t) { template std::vector> plan_all(int kfield, int *loens, - int nfft, int *offsets) { + int nfft, int64_t *offsets) { static constexpr bool is_forward = Direction == HIPFFT_R2C || Direction == HIPFFT_D2Z; @@ -120,7 +120,7 @@ std::vector> plan_all(int kfield, int *loens, template void run_group_graph(typename Type::real *data_real, typename Type::cmplx *data_complex, int kfield, int *loens, - int *offsets, int nfft, void *growing_allocator) { + int64_t *offsets, int nfft, void *growing_allocator) { growing_allocator_register_free_c(growing_allocator, free_fft_cache); @@ -183,7 +183,7 @@ void run_group_graph(typename Type::real *data_real, template void run_group(typename Type::real *data_real, typename Type::cmplx *data_complex, int kfield, int *loens, - int *offsets, int nfft, void *growing_allocator) { + int64_t *offsets, int nfft, void *growing_allocator) { auto plans = plan_all(kfield, loens, nfft, offsets); for (auto &plan : plans) @@ -199,27 +199,27 @@ extern "C" { #define RUN run_group #endif void execute_dir_fft_float(float *data_real, hipfftComplex *data_complex, - int kfield, int *loens, int *offsets, int nfft, + int kfield, int *loens, int64_t *offsets, int nfft, void *growing_allocator) { RUN(data_real, data_complex, kfield, loens, offsets, nfft, growing_allocator); } void execute_inv_fft_float(hipfftComplex *data_complex, float *data_real, - int kfield, int *loens, int *offsets, int nfft, + int kfield, int *loens, int64_t *offsets, int nfft, void *growing_allocator) { RUN(data_real, data_complex, kfield, loens, offsets, nfft, growing_allocator); } void execute_dir_fft_double(double *data_real, hipfftDoubleComplex *data_complex, int kfield, - int *loens, int *offsets, int nfft, + int *loens, int64_t *offsets, int nfft, void *growing_allocator) { RUN(data_real, data_complex, kfield, loens, offsets, nfft, growing_allocator); } void execute_inv_fft_double(hipfftDoubleComplex *data_complex, double *data_real, int kfield, int *loens, - int *offsets, int nfft, void *growing_allocator) { + int64_t *offsets, int nfft, void *growing_allocator) { RUN(data_real, data_complex, kfield, loens, offsets, nfft, growing_allocator); } diff --git a/src/trans/gpu/external/setup_trans.F90 b/src/trans/gpu/external/setup_trans.F90 index bf1cec9db..dacf092c5 100755 --- a/src/trans/gpu/external/setup_trans.F90 +++ b/src/trans/gpu/external/setup_trans.F90 @@ -98,7 +98,7 @@ SUBROUTINE SETUP_TRANS(KSMAX,KDGL,KDLON,KLOEN,LDSPLIT,PSTRET,& ! R. El Khatib 07-Mar-2016 Better flexibility for Legendre polynomials computation in stretched mode ! ------------------------------------------------------------------ -USE PARKIND1, ONLY: JPIM, JPRB, JPRD +USE PARKIND1, ONLY: JPIM, JPRB, JPRD, JPIB USE PARKIND_ECTRANS, ONLY: JPRBT !ifndef INTERFACE @@ -503,8 +503,8 @@ SUBROUTINE SETUP_TRANS(KSMAX,KDGL,KDLON,KLOEN,LDSPLIT,PSTRET,& ALLOCATE(ZAS(ALIGN(R%NDGNH,8),ALIGN((R%NTMAX+3)/2,8),D%NUMP)) WRITE(NOUT,*)'setup_trans: sizes1 NUMP=',D%NUMP -WRITE(NOUT,*)'ZAS:',size(ZAS) -WRITE(NOUT,*)'ZAA:',size(ZAA) +WRITE(NOUT,*)'ZAS:',SIZE(ZAS,KIND=JPIB) +WRITE(NOUT,*)'ZAA:',SIZE(ZAA,KIND=JPIB) ZAA(:,:,:) = 0._JPRBT ZAS(:,:,:) = 0._JPRBT diff --git a/src/trans/gpu/internal/fsc_mod.F90 b/src/trans/gpu/internal/fsc_mod.F90 index 2254af6fb..544d78eb6 100755 --- a/src/trans/gpu/internal/fsc_mod.F90 +++ b/src/trans/gpu/internal/fsc_mod.F90 @@ -11,7 +11,7 @@ MODULE FSC_MOD USE BUFFERED_ALLOCATOR_MOD, ONLY: BUFFERED_ALLOCATOR - USE PARKIND_ECTRANS, ONLY: JPIM, JPRBT + USE PARKIND_ECTRANS, ONLY: JPIM, JPRBT, JPIB USE TPM_DISTR, ONLY: D IMPLICIT NONE @@ -82,10 +82,10 @@ SUBROUTINE FSC(ALLOCATOR,HFSC,PREEL_COMPLEX, KF_FS, KF_UV, KF_SCALARS, KUV_OFFSE INTEGER(KIND=JPIM) :: KGL REAL(KIND=JPRBT) :: ZACHTE2 -REAL(KIND=JPRBT) :: ZAMP, ZPHASE -INTEGER(KIND=JPIM) :: IOFF_LAT,OFFSET_VAR -INTEGER(KIND=JPIM) :: IOFF_SCALARS,IOFF_SCALARS_EWDER,IOFF_UV,IOFF_UV_EWDER,IOFF_KSCALARS_NSDER -INTEGER(KIND=JPIM) :: JF,IGLG,II,JM +INTEGER(KIND=JPIM) :: OFFSET_VAR +INTEGER(KIND=JPIB) :: IOFF_LAT +INTEGER(KIND=JPIB) :: IOFF_SCALARS,IOFF_SCALARS_EWDER,IOFF_UV,IOFF_UV_EWDER,IOFF_KSCALARS_NSDER +INTEGER(KIND=JPIM) :: JF,IGLG,JM INTEGER(KIND=JPIM) :: IBEG,IEND,IINC REAL(KIND=JPRBT) :: RET_REAL, RET_COMPLEX @@ -131,7 +131,7 @@ SUBROUTINE FSC(ALLOCATOR,HFSC,PREEL_COMPLEX, KF_FS, KF_UV, KF_SCALARS, KUV_OFFSE DO JM=0,R_NSMAX !(note that R_NSMAX <= G_NMEN(IGLG) for all IGLG) IGLG = OFFSET_VAR+KGL-1 IF (JM <= G_NMEN(IGLG)) THEN - IOFF_LAT = KF_FS*D_NSTAGTF(KGL) + IOFF_LAT = 1_JPIB*KF_FS*D_NSTAGTF(KGL) IOFF_UV = IOFF_LAT+(KUV_OFFSET+JF-1)*(D_NSTAGTF(KGL+1)-D_NSTAGTF(KGL)) ZACHTE2 = REAL(F_RACTHE(IGLG),JPRBT) @@ -160,7 +160,7 @@ SUBROUTINE FSC(ALLOCATOR,HFSC,PREEL_COMPLEX, KF_FS, KF_UV, KF_SCALARS, KUV_OFFSE DO JM=0,R_NSMAX !(note that R_NSMAX <= G_NMEN(IGLG) for all IGLG) IGLG = OFFSET_VAR+KGL-1 IF (JM <= G_NMEN(IGLG)) THEN - IOFF_LAT = KF_FS*D_NSTAGTF(KGL) + IOFF_LAT = 1_JPIB*KF_FS*D_NSTAGTF(KGL) IOFF_KSCALARS_NSDER = IOFF_LAT+(KSCALARS_NSDER_OFFSET+JF-1)*(D_NSTAGTF(KGL+1)-D_NSTAGTF(KGL)) ZACHTE2 = REAL(F_RACTHE(IGLG),JPRBT) @@ -198,7 +198,7 @@ SUBROUTINE FSC(ALLOCATOR,HFSC,PREEL_COMPLEX, KF_FS, KF_UV, KF_SCALARS, KUV_OFFSE ! to fill those floor(NLON/2)+1 values. ! Truncation happens starting at G_NMEN+1. Hence, we zero-fill those values. IF (JM <= G_NLOEN(IGLG)/2) THEN - IOFF_LAT = KF_FS*D_NSTAGTF(KGL) + IOFF_LAT = 1_JPIB*KF_FS*D_NSTAGTF(KGL) IOFF_UV = IOFF_LAT+(KUV_OFFSET+JF-1)*(D_NSTAGTF(KGL+1)-D_NSTAGTF(KGL)) IOFF_UV_EWDER = IOFF_LAT+(KUV_EWDER_OFFSET+JF-1)*(D_NSTAGTF(KGL+1)-D_NSTAGTF(KGL)) @@ -239,7 +239,7 @@ SUBROUTINE FSC(ALLOCATOR,HFSC,PREEL_COMPLEX, KF_FS, KF_UV, KF_SCALARS, KUV_OFFSE ! to fill those floor(NLON/2)+1 values. ! Truncation happens starting at G_NMEN+1. Hence, we zero-fill those values. IF (JM <= G_NLOEN(IGLG)/2) THEN - IOFF_LAT = KF_FS*D_NSTAGTF(KGL) + IOFF_LAT = 1_JPIB*KF_FS*D_NSTAGTF(KGL) IOFF_SCALARS_EWDER = IOFF_LAT+(KSCALARS_EWDER_OFFSET+JF-1)*(D_NSTAGTF(KGL+1)-D_NSTAGTF(KGL)) IOFF_SCALARS = IOFF_LAT+(KSCALARS_OFFSET+JF-1)*(D_NSTAGTF(KGL+1)-D_NSTAGTF(KGL)) diff --git a/src/trans/gpu/internal/ftdir_mod.F90 b/src/trans/gpu/internal/ftdir_mod.F90 index b182a7e3a..e24c9c6c1 100755 --- a/src/trans/gpu/internal/ftdir_mod.F90 +++ b/src/trans/gpu/internal/ftdir_mod.F90 @@ -36,7 +36,7 @@ FUNCTION PREPARE_FTDIR(ALLOCATOR,KF_FS) RESULT(HFTDIR) REAL(KIND=JPRBT) :: DUMMY #ifndef IN_PLACE_FFT - HFTDIR%HREEL_COMPLEX = RESERVE(ALLOCATOR, INT(KF_FS*D%NLENGTF,KIND=C_SIZE_T)*C_SIZEOF(DUMMY)) + HFTDIR%HREEL_COMPLEX = RESERVE(ALLOCATOR, 1_JPIB*KF_FS*D%NLENGTF*C_SIZEOF(DUMMY), "HFTDIR%HREEL_COMPLEX") #endif END FUNCTION PREPARE_FTDIR @@ -74,7 +74,7 @@ SUBROUTINE FTDIR(ALLOCATOR,HFTDIR,PREEL_REAL,PREEL_COMPLEX,KFIELD) ! ------------------------------------------------------------------ USE TPM_GEN, ONLY: LSYNC_TRANS - USE PARKIND_ECTRANS, ONLY: JPIM, JPRBT + USE PARKIND_ECTRANS, ONLY: JPIM, JPRBT, JPIB USE TPM_DISTR, ONLY: MYSETW, MYPROC, NPROC, D_NSTAGT0B, D_NSTAGTF,D_NPTRLS, & & D_NPNTGTB0, D_NPROCM, D_NDGL_FS, D USE TPM_GEOMETRY, ONLY: G_NMEN, G_NLOEN @@ -98,7 +98,7 @@ SUBROUTINE FTDIR(ALLOCATOR,HFTDIR,PREEL_REAL,PREEL_COMPLEX,KFIELD) PREEL_COMPLEX => PREEL_REAL #else CALL ASSIGN_PTR(PREEL_COMPLEX, GET_ALLOCATION(ALLOCATOR, HFTDIR%HREEL_COMPLEX),& - & 1_C_SIZE_T, INT(KFIELD*D%NLENGTF,KIND=C_SIZE_T)*C_SIZEOF(PREEL_COMPLEX(1))) + & 1_JPIB, 1_JPIB*KFIELD*D%NLENGTF*C_SIZEOF(PREEL_COMPLEX(1))) #endif #ifdef ACCGPU diff --git a/src/trans/gpu/internal/ftinv_mod.F90 b/src/trans/gpu/internal/ftinv_mod.F90 index b6bb0a112..c55e7d933 100755 --- a/src/trans/gpu/internal/ftinv_mod.F90 +++ b/src/trans/gpu/internal/ftinv_mod.F90 @@ -35,7 +35,7 @@ FUNCTION PREPARE_FTINV(ALLOCATOR,KF_FS) RESULT(HFTINV) REAL(KIND=JPRBT) :: DUMMY #ifndef IN_PLACE_FFT - HFTINV%HREEL_REAL = RESERVE(ALLOCATOR, INT(D%NLENGTF*KF_FS,KIND=C_SIZE_T)*C_SIZEOF(DUMMY)) + HFTINV%HREEL_REAL = RESERVE(ALLOCATOR, 1_JPIB*D%NLENGTF*KF_FS*C_SIZEOF(DUMMY),"HFTINV%HREEL_REAL") #endif END FUNCTION @@ -73,7 +73,7 @@ SUBROUTINE FTINV(ALLOCATOR,HFTINV,PREEL_COMPLEX,PREEL_REAL,KFIELD) ! ------------------------------------------------------------------ USE TPM_GEN, ONLY: LSYNC_TRANS - USE PARKIND_ECTRANS, ONLY: JPIM, JPRBT + USE PARKIND_ECTRANS, ONLY: JPIM, JPRBT, JPIB USE TPM_DISTR, ONLY: MYSETW, D_NPTRLS, D_NDGL_FS, D_NSTAGTF, D USE TPM_GEOMETRY, ONLY: G_NLOEN USE TPM_HICFFT, ONLY: EXECUTE_INV_FFT @@ -96,7 +96,7 @@ SUBROUTINE FTINV(ALLOCATOR,HFTINV,PREEL_COMPLEX,PREEL_REAL,KFIELD) PREEL_REAL => PREEL_COMPLEX #else CALL ASSIGN_PTR(PREEL_REAL, GET_ALLOCATION(ALLOCATOR, HFTINV%HREEL_REAL),& - & 1_C_SIZE_T, INT(KFIELD*D%NLENGTF,KIND=C_SIZE_T)*C_SIZEOF(PREEL_REAL(1))) + & 1_JPIB, 1_JPIB*KFIELD*D%NLENGTF*C_SIZEOF(PREEL_REAL(1))) #endif #ifdef OMPGPU diff --git a/src/trans/gpu/internal/ledir_mod.F90 b/src/trans/gpu/internal/ledir_mod.F90 index e12f89afb..ad8e052ba 100755 --- a/src/trans/gpu/internal/ledir_mod.F90 +++ b/src/trans/gpu/internal/ledir_mod.F90 @@ -11,7 +11,7 @@ ! MODULE LEDIR_MOD - USE PARKIND_ECTRANS, ONLY: JPIM, JPRBT, JPRD + USE PARKIND_ECTRANS, ONLY: JPIM, JPRBT, JPRD, JPIB USE BUFFERED_ALLOCATOR_MOD, ONLY: BUFFERED_ALLOCATOR IMPLICIT NONE @@ -29,8 +29,10 @@ SUBROUTINE LEDIR_STRIDES(KF_FS,IOUT_STRIDES0,IOUT_SIZE,IIN_STRIDES0,IIN_SIZE,& INTEGER(KIND=JPIM), INTENT(IN) :: KF_FS - INTEGER(KIND=JPIM), OPTIONAL :: IOUT_STRIDES0, IOUT_SIZE - INTEGER(KIND=JPIM), OPTIONAL :: IIN_STRIDES0, IIN_SIZE + INTEGER(KIND=JPIM), OPTIONAL :: IOUT_STRIDES0 + INTEGER(KIND=JPIB), OPTIONAL :: IOUT_SIZE + INTEGER(KIND=JPIM), OPTIONAL :: IIN_STRIDES0 + INTEGER(KIND=JPIB), OPTIONAL :: IIN_SIZE INTEGER(KIND=JPIM), OPTIONAL :: IOUT0_STRIDES0, IOUT0_SIZE INTEGER(KIND=JPIM), OPTIONAL :: IIN0_STRIDES0, IIN0_SIZE @@ -127,15 +129,18 @@ SUBROUTINE LEDIR(ALLOCATOR,ZINPS,ZINPA,ZINPS0,ZINPA0,ZOUT,ZOUT0,POA1,KF_FS) INTEGER(KIND=JPIM) :: KM INTEGER(KIND=JPIM) :: KMLOC INTEGER(KIND=JPIM) :: IA, IS, ISL, J - INTEGER(KIND=JPIM) :: KS(D_NUMP), NS(D_NUMP), AOFFSETS(D_NUMP), BOFFSETS(D_NUMP), COFFSETS(D_NUMP) + INTEGER(KIND=JPIM) :: KS(D_NUMP), NS(D_NUMP) + INTEGER(KIND=JPIB) :: AOFFSETS(D_NUMP), BOFFSETS(D_NUMP), COFFSETS(D_NUMP) REAL(KIND=JPHOOK) :: ZHOOK_HANDLE REAL(KIND=JPRBT) :: PAIA, PAIS, V1, V2 INTEGER(KIND=JPIM) :: IGLS, JF, JGL INTEGER(KIND=JPIM) :: OFFSET1, OFFSET2 - INTEGER(KIND=JPIM) :: IOUT_STRIDES0, IOUT_STRIDES1 - INTEGER(KIND=JPIM) :: IIN_STRIDES0, IIN_STRIDES1 + INTEGER(KIND=JPIM) :: IOUT_STRIDES0 + INTEGER(KIND=JPIB) :: IOUT_STRIDES1 + INTEGER(KIND=JPIM) :: IIN_STRIDES0 + INTEGER(KIND=JPIB) :: IIN_STRIDES1 INTEGER(KIND=JPIM) :: IOUT0_STRIDES0, IOUT0_STRIDES1 INTEGER(KIND=JPIM) :: IIN0_STRIDES0, IIN0_STRIDES1 INTEGER(KIND=8) :: ALLOC_SZ, ALLOC_POS diff --git a/src/trans/gpu/internal/leinv_mod.F90 b/src/trans/gpu/internal/leinv_mod.F90 index 70a729ac0..66c5e87bb 100755 --- a/src/trans/gpu/internal/leinv_mod.F90 +++ b/src/trans/gpu/internal/leinv_mod.F90 @@ -11,7 +11,7 @@ ! MODULE LEINV_MOD - USE PARKIND_ECTRANS, ONLY: JPIM, JPRB, JPRBT, JPRD + USE PARKIND_ECTRANS, ONLY: JPIM, JPRB, JPRBT, JPRD, JPIB USE BUFFERED_ALLOCATOR_MOD, ONLY: BUFFERED_ALLOCATOR IMPLICIT NONE @@ -30,8 +30,10 @@ SUBROUTINE LEINV_STRIDES(KF_LEG,IOUT_STRIDES0,IOUT_SIZE,IIN_STRIDES0,IIN_SIZE,& INTEGER(KIND=JPIM), INTENT(IN) :: KF_LEG - INTEGER(KIND=JPIM), OPTIONAL :: IOUT_STRIDES0, IOUT_SIZE - INTEGER(KIND=JPIM), OPTIONAL :: IIN_STRIDES0, IIN_SIZE + INTEGER(KIND=JPIM), OPTIONAL :: IOUT_STRIDES0 + INTEGER(KIND=JPIB), OPTIONAL :: IOUT_SIZE + INTEGER(KIND=JPIM), OPTIONAL :: IIN_STRIDES0 + INTEGER(KIND=JPIB), OPTIONAL :: IIN_SIZE INTEGER(KIND=JPIM), OPTIONAL :: IOUT0_STRIDES0, IOUT0_SIZE INTEGER(KIND=JPIM), OPTIONAL :: IIN0_STRIDES0, IIN0_SIZE @@ -119,10 +121,13 @@ SUBROUTINE LEINV(ALLOCATOR,PIA,ZINP,ZINP0,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,KF_LEG) TYPE(BUFFERED_ALLOCATOR), INTENT(IN) :: ALLOCATOR ! LOCAL - INTEGER(KIND=JPIM) :: KS(D_NUMP), NS(D_NUMP), AOFFSETS(D_NUMP), BOFFSETS(D_NUMP), COFFSETS(D_NUMP) + INTEGER(KIND=JPIM) :: KS(D_NUMP), NS(D_NUMP) + INTEGER(KIND=JPIB) :: AOFFSETS(D_NUMP), BOFFSETS(D_NUMP), COFFSETS(D_NUMP) INTEGER(KIND=JPIM) :: KM, KMLOC, IA, IS, ISL, J1, JGL, JK, J - INTEGER(KIND=JPIM) :: IOUT_STRIDES0, IOUT_SIZE - INTEGER(KIND=JPIM) :: IIN_STRIDES0, IIN_SIZE + INTEGER(KIND=JPIM) :: IOUT_STRIDES0 + INTEGER(KIND=JPIB) :: IOUT_SIZE + INTEGER(KIND=JPIM) :: IIN_STRIDES0 + INTEGER(KIND=JPIB) :: IIN_SIZE INTEGER(KIND=JPIM) :: IOUT0_STRIDES0, IOUT0_SIZE INTEGER(KIND=JPIM) :: IIN0_STRIDES0, IIN0_SIZE diff --git a/src/trans/gpu/internal/ltdir_mod.F90 b/src/trans/gpu/internal/ltdir_mod.F90 index 293bf6734..567e76a89 100755 --- a/src/trans/gpu/internal/ltdir_mod.F90 +++ b/src/trans/gpu/internal/ltdir_mod.F90 @@ -11,7 +11,7 @@ ! MODULE LTDIR_MOD - USE PARKIND_ECTRANS, ONLY: JPIM, JPRBT, JPRB, JPRD + USE PARKIND_ECTRANS, ONLY: JPIM, JPRBT, JPRB, JPRD, JPIB USE BUFFERED_ALLOCATOR_MOD, ONLY: ALLOCATION_RESERVATION_HANDLE IMPLICIT NONE @@ -36,8 +36,9 @@ FUNCTION PREPARE_LTDIR(ALLOCATOR, KF_FS, KF_UV) RESULT(HLTDIR) INTEGER(KIND=JPIM), INTENT(IN) :: KF_FS, KF_UV TYPE(LTDIR_HANDLE) :: HLTDIR - INTEGER(KIND=C_SIZE_T) :: IALLOC_SZ - INTEGER(KIND=JPIM) :: IOUT_STRIDES0, IOUT_SIZE + INTEGER(KIND=JPIB) :: IALLOC_SZ + INTEGER(KIND=JPIM) :: IOUT_STRIDES0 + INTEGER(KIND=JPIB) :: IOUT_SIZE INTEGER(KIND=JPIM) :: IOUT0_STRIDES0, IOUT0_SIZE REAL(KIND=JPRBT) :: ZPRBT_DUMMY @@ -47,15 +48,15 @@ FUNCTION PREPARE_LTDIR(ALLOCATOR, KF_FS, KF_UV) RESULT(HLTDIR) IOUT0_STRIDES0=IOUT0_STRIDES0,IOUT0_SIZE=IOUT0_SIZE) ! POA1 - IALLOC_SZ = ALIGN(INT(2*KF_FS*(R%NTMAX+3)*D%NUMP,KIND=C_SIZE_T)*C_SIZEOF(ZPRBT_DUMMY),128) + IALLOC_SZ = ALIGN(2_JPIB*KF_FS*(R%NTMAX+3)*D%NUMP*C_SIZEOF(ZPRBT_DUMMY),128) ! POA2 - IALLOC_SZ = IALLOC_SZ + ALIGN(INT(4*KF_UV*(R%NTMAX+3)*D%NUMP,KIND=C_SIZE_T)*C_SIZEOF(ZPRBT_DUMMY),128) + IALLOC_SZ = IALLOC_SZ + ALIGN(4_JPIB*KF_UV*(R%NTMAX+3)*D%NUMP*C_SIZEOF(ZPRBT_DUMMY),128) ! ZOUT - IALLOC_SZ = IALLOC_SZ + ALIGN(INT(IOUT_SIZE,KIND=C_SIZE_T)*C_SIZEOF(ZPRBT_DUMMY),128) + IALLOC_SZ = IALLOC_SZ + ALIGN(IOUT_SIZE*C_SIZEOF(ZPRBT_DUMMY),128) ! ZOUT0 - IALLOC_SZ = IALLOC_SZ+ ALIGN(INT(IOUT0_SIZE,KIND=C_SIZE_T)*C_SIZEOF(ZPRD_DUMMY),128) + IALLOC_SZ = IALLOC_SZ+ ALIGN(IOUT0_SIZE*C_SIZEOF(ZPRD_DUMMY),128) - HLTDIR%HOUT_AND_POA = RESERVE(ALLOCATOR, IALLOC_SZ) + HLTDIR%HOUT_AND_POA = RESERVE(ALLOCATOR, IALLOC_SZ, "HLTDIR%HOUT_AND_POA") END FUNCTION PREPARE_LTDIR SUBROUTINE LTDIR(ALLOCATOR,HLTDIR,ZINPS,ZINPA,ZINPS0,ZINPA0,KF_FS,KF_UV,KF_SCALARS,& @@ -162,8 +163,9 @@ SUBROUTINE LTDIR(ALLOCATOR,HLTDIR,ZINPS,ZINPA,ZINPS0,ZINPA0,KF_FS,KF_UV,KF_SCALA REAL(KIND=JPRD), POINTER :: ZOUT0(:) TYPE(BUFFERED_ALLOCATOR), INTENT(IN) :: ALLOCATOR TYPE(LTDIR_HANDLE), INTENT(IN) :: HLTDIR - INTEGER(KIND=C_SIZE_T) :: IALLOC_POS, IALLOC_SZ - INTEGER(KIND=JPIM) :: IOUT_STRIDES0, IOUT_SIZE + INTEGER(KIND=JPIB) :: IALLOC_POS, IALLOC_SZ + INTEGER(KIND=JPIM) :: IOUT_STRIDES0 + INTEGER(KIND=JPIB) :: IOUT_SIZE INTEGER(KIND=JPIM) :: IOUT0_STRIDES0, IOUT0_SIZE @@ -186,26 +188,26 @@ SUBROUTINE LTDIR(ALLOCATOR,HLTDIR,ZINPS,ZINPA,ZINPS0,ZINPA0,KF_FS,KF_UV,KF_SCALA IALLOC_POS = 1 - IALLOC_SZ = ALIGN(INT(2*KF_FS*(R%NTMAX+3)*D%NUMP,KIND=C_SIZE_T)*C_SIZEOF(POA1_L(1)),128) + IALLOC_SZ = ALIGN(2_JPIB*KF_FS*(R%NTMAX+3)*D%NUMP*C_SIZEOF(POA1_L(1)),128) CALL ASSIGN_PTR(POA1_L, GET_ALLOCATION(ALLOCATOR, HLTDIR%HOUT_AND_POA),& & IALLOC_POS, IALLOC_SZ, SET_STREAM=1) CALL C_F_POINTER(C_LOC(POA1_L), POA1, (/ 2*KF_FS, R%NTMAX+3, D%NUMP /)) IALLOC_POS = IALLOC_POS + IALLOC_SZ - IALLOC_SZ = ALIGN(INT(4*KF_UV*(R%NTMAX+3)*D%NUMP,KIND=C_SIZE_T)*C_SIZEOF(POA2_L(1)),128) + IALLOC_SZ = ALIGN(4_JPIB*KF_UV*(R%NTMAX+3)*D%NUMP*C_SIZEOF(POA2_L(1)),128) CALL ASSIGN_PTR(POA2_L, GET_ALLOCATION(ALLOCATOR, HLTDIR%HOUT_AND_POA),& & IALLOC_POS, IALLOC_SZ, SET_STREAM=1) CALL C_F_POINTER(C_LOC(POA2_L), POA2, (/ 4*KF_UV, R%NTMAX+3, D%NUMP /)) IALLOC_POS = IALLOC_POS + IALLOC_SZ ! ZOUT - IALLOC_SZ = ALIGN(INT(IOUT_SIZE,C_SIZE_T)*C_SIZEOF(ZOUT(1)),128) + IALLOC_SZ = ALIGN(IOUT_SIZE*C_SIZEOF(ZOUT(1)),128) CALL ASSIGN_PTR(ZOUT, GET_ALLOCATION(ALLOCATOR, HLTDIR%HOUT_AND_POA),& & IALLOC_POS, IALLOC_SZ, SET_STREAM=1) IALLOC_POS = IALLOC_POS + IALLOC_SZ ! ZOUT0 - IALLOC_SZ = ALIGN(INT(IOUT0_SIZE,C_SIZE_T)*C_SIZEOF(ZOUT0(1)),128) + IALLOC_SZ = ALIGN(IOUT0_SIZE*C_SIZEOF(ZOUT0(1)),128) CALL ASSIGN_PTR(ZOUT0, GET_ALLOCATION(ALLOCATOR, HLTDIR%HOUT_AND_POA),& & IALLOC_POS, IALLOC_SZ, SET_STREAM=1) IALLOC_POS = IALLOC_POS + IALLOC_SZ diff --git a/src/trans/gpu/internal/ltinv_mod.F90 b/src/trans/gpu/internal/ltinv_mod.F90 index 11ed079a5..909d138b3 100755 --- a/src/trans/gpu/internal/ltinv_mod.F90 +++ b/src/trans/gpu/internal/ltinv_mod.F90 @@ -25,7 +25,7 @@ MODULE LTINV_MOD CONTAINS FUNCTION PREPARE_LTINV(ALLOCATOR,KF_UV,KF_SCALARS,LVORGP,LDIVGP,LSCDERS) RESULT(HLTINV) - USE PARKIND_ECTRANS, ONLY: JPIM, JPRBT, JPRD + USE PARKIND_ECTRANS, ONLY: JPIM, JPRBT, JPRD, JPIB USE TPM_DISTR, ONLY: D USE TPM_DIM, ONLY: R USE ISO_C_BINDING, ONLY: C_SIZE_T, C_SIZEOF @@ -40,10 +40,12 @@ FUNCTION PREPARE_LTINV(ALLOCATOR,KF_UV,KF_SCALARS,LVORGP,LDIVGP,LSCDERS) RESULT( TYPE(LTINV_HANDLE) :: HLTINV - INTEGER(KIND=C_SIZE_T) :: IALLOC_SZ, IPIA_SZ - INTEGER(KIND=JPIM) :: IOUT_STRIDES0, IOUT_SIZE + INTEGER(KIND=JPIB) :: IALLOC_SZ, IPIA_SZ + INTEGER(KIND=JPIM) :: IOUT_STRIDES0 + INTEGER(KIND=JPIB) :: IOUT_SIZE INTEGER(KIND=JPIM) :: IOUT0_STRIDES0, IOUT0_SIZE - INTEGER(KIND=JPIM) :: IIN_STRIDES0, IIN_SIZE + INTEGER(KIND=JPIM) :: IIN_STRIDES0 + INTEGER(KIND=JPIB) :: IIN_SIZE INTEGER(KIND=JPIM) :: IIN0_STRIDES0, IIN0_SIZE REAL(KIND=JPRBT) :: ZPRBT_DUMMY @@ -63,7 +65,7 @@ FUNCTION PREPARE_LTINV(ALLOCATOR,KF_UV,KF_SCALARS,LVORGP,LDIVGP,LSCDERS) RESULT( IF (LSCDERS) & IF_READIN = IF_READIN + KF_SCALARS ! Scalars NS Derivatives - IPIA_SZ = ALIGN(INT(2*IF_READIN*(R%NSMAX+3)*D%NUMP,KIND=C_SIZE_T)*C_SIZEOF(ZPRBT_DUMMY),128) + IPIA_SZ = ALIGN(2_JPIB*IF_READIN*(R%NSMAX+3)*D%NUMP*C_SIZEOF(ZPRBT_DUMMY),128) ! In Legendre space, we then ignore vorticity/divergence, if ! they don't need to be transformed. @@ -77,23 +79,23 @@ FUNCTION PREPARE_LTINV(ALLOCATOR,KF_UV,KF_SCALARS,LVORGP,LDIVGP,LSCDERS) RESULT( ! PIA IALLOC_SZ = IPIA_SZ ! ZINP - IALLOC_SZ = IALLOC_SZ + ALIGN(INT(IIN_SIZE,KIND=C_SIZE_T)*C_SIZEOF(ZPRBT_DUMMY),128) + IALLOC_SZ = IALLOC_SZ + ALIGN(IIN_SIZE*C_SIZEOF(ZPRBT_DUMMY),128) ! ZINP0 - IALLOC_SZ = IALLOC_SZ + ALIGN(INT(IIN0_SIZE,KIND=C_SIZE_T)*C_SIZEOF(ZPRD_DUMMY),128) + IALLOC_SZ = IALLOC_SZ + ALIGN(IIN0_SIZE*C_SIZEOF(ZPRD_DUMMY),128) - HLTINV%HPIA_AND_IN = RESERVE(ALLOCATOR, IALLOC_SZ) + HLTINV%HPIA_AND_IN = RESERVE(ALLOCATOR, IALLOC_SZ, "HLTINV_HPIA_AND_IN") IALLOC_SZ = 0 ! ZOUTA - IALLOC_SZ = IALLOC_SZ + ALIGN(INT(IOUT_SIZE,KIND=C_SIZE_T)*C_SIZEOF(ZPRBT_DUMMY),128) + IALLOC_SZ = IALLOC_SZ + ALIGN(IOUT_SIZE*C_SIZEOF(ZPRBT_DUMMY),128) ! ZOUTS - IALLOC_SZ = IALLOC_SZ + ALIGN(INT(IOUT_SIZE,KIND=C_SIZE_T)*C_SIZEOF(ZPRBT_DUMMY),128) + IALLOC_SZ = IALLOC_SZ + ALIGN(IOUT_SIZE*C_SIZEOF(ZPRBT_DUMMY),128) ! ZOUTA0 - IALLOC_SZ = IALLOC_SZ + ALIGN(INT(IOUT0_SIZE,KIND=C_SIZE_T)*C_SIZEOF(ZPRD_DUMMY),128) + IALLOC_SZ = IALLOC_SZ + ALIGN(IOUT0_SIZE*C_SIZEOF(ZPRD_DUMMY),128) ! ZOUTS0 - IALLOC_SZ = IALLOC_SZ + ALIGN(INT(IOUT0_SIZE,KIND=C_SIZE_T)*C_SIZEOF(ZPRD_DUMMY),128) + IALLOC_SZ = IALLOC_SZ + ALIGN(IOUT0_SIZE*C_SIZEOF(ZPRD_DUMMY),128) - HLTINV%HOUTS_AND_OUTA = RESERVE(ALLOCATOR, IALLOC_SZ) + HLTINV%HOUTS_AND_OUTA = RESERVE(ALLOCATOR, IALLOC_SZ, "HLTINV_HOUTS_AND_OUTA") END FUNCTION PREPARE_LTINV @@ -101,7 +103,7 @@ SUBROUTINE LTINV(ALLOCATOR,HLTINV,KF_UV,KF_SCALARS,& & PSPVOR,PSPDIV,PSPSCALAR,PSPSC3A,PSPSC3B,PSPSC2, & & ZOUTS,ZOUTA,ZOUTS0,ZOUTA0) - USE PARKIND_ECTRANS, ONLY: JPIM, JPRB, JPRBT, JPRD + USE PARKIND_ECTRANS, ONLY: JPIM, JPRB, JPRBT, JPRD, JPIB USE YOMHOOK, ONLY: LHOOK, DR_HOOK, JPHOOK USE TPM_DIM, ONLY: R USE TPM_TRANS, ONLY: LDIVGP, LVORGP, NF_SC2, NF_SC3A, NF_SC3B, LSCDERS @@ -193,13 +195,15 @@ SUBROUTINE LTINV(ALLOCATOR,HLTINV,KF_UV,KF_SCALARS,& TYPE(BUFFERED_ALLOCATOR), INTENT(IN) :: ALLOCATOR TYPE(LTINV_HANDLE), INTENT(IN) :: HLTINV - INTEGER(KIND=JPIM) :: IOUT_STRIDES0, IOUT_SIZE + INTEGER(KIND=JPIM) :: IOUT_STRIDES0 + INTEGER(KIND=JPIB) :: IOUT_SIZE INTEGER(KIND=JPIM) :: IOUT0_STRIDES0, IOUT0_SIZE - INTEGER(KIND=JPIM) :: IIN_STRIDES0, IIN_SIZE + INTEGER(KIND=JPIM) :: IIN_STRIDES0 + INTEGER(KIND=JPIB) :: IIN_SIZE INTEGER(KIND=JPIM) :: IIN0_STRIDES0, IIN0_SIZE INTEGER(KIND=JPIM) :: IF_READIN, IF_LEG - INTEGER(KIND=C_SIZE_T) :: IALLOC_POS, IALLOC_SZ + INTEGER(KIND=JPIB) :: IALLOC_POS, IALLOC_SZ REAL(KIND=JPRBT), POINTER :: ZINP(:) REAL(KIND=JPRD), POINTER :: ZINP0(:) @@ -233,20 +237,20 @@ SUBROUTINE LTINV(ALLOCATOR,HLTINV,KF_UV,KF_SCALARS,& IALLOC_POS = 1 ! PIA - IALLOC_SZ = ALIGN(INT(2*IF_READIN*(R%NTMAX+3)*D%NUMP,KIND=C_SIZE_T)*C_SIZEOF(PIA_L(1)),128) + IALLOC_SZ = ALIGN(2_JPIB*IF_READIN*(R%NTMAX+3)*D%NUMP*C_SIZEOF(PIA_L(1)),128) CALL ASSIGN_PTR(PIA_L, GET_ALLOCATION(ALLOCATOR, HLTINV%HPIA_AND_IN),& & IALLOC_POS, IALLOC_SZ) CALL C_F_POINTER(C_LOC(PIA_L), PIA, (/ 2*IF_READIN, R%NTMAX+3, D%NUMP /)) IALLOC_POS = IALLOC_POS + IALLOC_SZ ! ZINP - IALLOC_SZ = ALIGN(INT(IIN_SIZE,KIND=C_SIZE_T)*C_SIZEOF(ZINP(1)),128) + IALLOC_SZ = ALIGN(IIN_SIZE*C_SIZEOF(ZINP(1)),128) CALL ASSIGN_PTR(ZINP, GET_ALLOCATION(ALLOCATOR, HLTINV%HPIA_AND_IN),& & IALLOC_POS, IALLOC_SZ) IALLOC_POS = IALLOC_POS + IALLOC_SZ ! ZINP0 - IALLOC_SZ = ALIGN(INT(IIN0_SIZE,KIND=C_SIZE_T)*C_SIZEOF(ZINP0(1)),128) + IALLOC_SZ = ALIGN(IIN0_SIZE*C_SIZEOF(ZINP0(1)),128) CALL ASSIGN_PTR(ZINP0, GET_ALLOCATION(ALLOCATOR, HLTINV%HPIA_AND_IN),& & IALLOC_POS, IALLOC_SZ) IALLOC_POS = IALLOC_POS + IALLOC_SZ @@ -254,25 +258,25 @@ SUBROUTINE LTINV(ALLOCATOR,HLTINV,KF_UV,KF_SCALARS,& IALLOC_POS = 1 ! ZOUTA - IALLOC_SZ = ALIGN(INT(IOUT_SIZE,KIND=C_SIZE_T)*C_SIZEOF(ZOUTA(1)),128) + IALLOC_SZ = ALIGN(IOUT_SIZE*C_SIZEOF(ZOUTA(1)),128) CALL ASSIGN_PTR(ZOUTA, GET_ALLOCATION(ALLOCATOR, HLTINV%HOUTS_AND_OUTA),& & IALLOC_POS, IALLOC_SZ) IALLOC_POS = IALLOC_POS + IALLOC_SZ ! ZOUTS - IALLOC_SZ = ALIGN(INT(IOUT_SIZE,KIND=C_SIZE_T)*C_SIZEOF(ZOUTS(1)),128) + IALLOC_SZ = ALIGN(IOUT_SIZE*C_SIZEOF(ZOUTS(1)),128) CALL ASSIGN_PTR(ZOUTS, GET_ALLOCATION(ALLOCATOR, HLTINV%HOUTS_AND_OUTA),& & IALLOC_POS, IALLOC_SZ) IALLOC_POS = IALLOC_POS + IALLOC_SZ ! ZOUTA0 - IALLOC_SZ = ALIGN(INT(IOUT0_SIZE,KIND=C_SIZE_T)*C_SIZEOF(ZOUTA0(1)),128) + IALLOC_SZ = ALIGN(IOUT0_SIZE*C_SIZEOF(ZOUTA0(1)),128) CALL ASSIGN_PTR(ZOUTA0, GET_ALLOCATION(ALLOCATOR, HLTINV%HOUTS_AND_OUTA),& & IALLOC_POS, IALLOC_SZ) IALLOC_POS = IALLOC_POS + IALLOC_SZ ! ZOUTS0 - IALLOC_SZ = ALIGN(INT(IOUT0_SIZE,KIND=C_SIZE_T)*C_SIZEOF(ZOUTS0(1)),128) + IALLOC_SZ = ALIGN(IOUT0_SIZE*C_SIZEOF(ZOUTS0(1)),128) CALL ASSIGN_PTR(ZOUTS0, GET_ALLOCATION(ALLOCATOR, HLTINV%HOUTS_AND_OUTA),& & IALLOC_POS, IALLOC_SZ) IALLOC_POS = IALLOC_POS + IALLOC_SZ diff --git a/src/trans/gpu/internal/sump_trans_mod.F90 b/src/trans/gpu/internal/sump_trans_mod.F90 index a0f2260b4..5a1de3028 100755 --- a/src/trans/gpu/internal/sump_trans_mod.F90 +++ b/src/trans/gpu/internal/sump_trans_mod.F90 @@ -19,7 +19,7 @@ SUBROUTINE SUMP_TRANS ! Modifications : ! P.Marguinaud : 11-Sep-2012 : Fix twice allocated pointer -USE EC_PARKIND ,ONLY : JPIM ,JPRD +USE EC_PARKIND ,ONLY : JPIM ,JPRD, JPIB USE TPM_GEN ,ONLY : NOUT, NPRINTLEV USE TPM_DIM ,ONLY : R @@ -40,7 +40,8 @@ SUBROUTINE SUMP_TRANS INTEGER(KIND=JPIM) :: JM INTEGER(KIND=JPIM) :: JGL,IGL,IPLAT,ISENDSET,IRECVSET,JML,IPOS,IM -INTEGER(KIND=JPIM) :: IGPTOT,IMEDIAP,IRESTM,JA,JB,IOFF,OFFSET1,OFFSET2,KMLOC,KM +INTEGER(KIND=JPIM) :: IGPTOT,IMEDIAP,IRESTM,JA,JB,IOFF,KMLOC,KM +INTEGER(KIND=JPIB) :: OFFSET1,OFFSET2 INTEGER(KIND=JPIM),ALLOCATABLE :: IGPTOTL(:,:) REAL(KIND=JPRD),ALLOCATABLE :: ZDUM(:) diff --git a/src/trans/gpu/internal/tpm_hicfft.F90 b/src/trans/gpu/internal/tpm_hicfft.F90 index 9ae63df00..bee163eee 100755 --- a/src/trans/gpu/internal/tpm_hicfft.F90 +++ b/src/trans/gpu/internal/tpm_hicfft.F90 @@ -19,7 +19,7 @@ MODULE TPM_HICFFT ! Original October 2014 ! HICFFT abstraction for CUDA and HIP August 2023 B. Reuter - USE, INTRINSIC :: ISO_C_BINDING, ONLY: C_INT, C_PTR, C_LOC, C_FLOAT, C_DOUBLE + USE, INTRINSIC :: ISO_C_BINDING, ONLY: C_FLOAT, C_DOUBLE, C_LOC USE GROWING_ALLOCATOR_MOD, ONLY: GROWING_ALLOCATION_TYPE IMPLICIT NONE @@ -45,23 +45,25 @@ MODULE TPM_HICFFT SUBROUTINE EXECUTE_DIR_FFT_FLOAT(PREEL_REAL,PREEL_COMPLEX,KFIELD,LOENS,OFFSETS,ALLOC) - USE EC_PARKIND ,ONLY : JPIM + USE EC_PARKIND ,ONLY : JPIM, JPIB IMPLICIT NONE REAL(KIND=C_FLOAT), INTENT(IN) :: PREEL_REAL(:) REAL(KIND=C_FLOAT), INTENT(OUT) :: PREEL_COMPLEX(:) INTEGER(KIND=JPIM),INTENT(IN) :: KFIELD - INTEGER(KIND=JPIM),INTENT(IN) :: LOENS(:), OFFSETS(:) + INTEGER(KIND=JPIM),INTENT(IN) :: LOENS(:) + INTEGER(KIND=JPIB),INTENT(IN) :: OFFSETS(:) TYPE(GROWING_ALLOCATION_TYPE), INTENT(IN) :: ALLOC INTERFACE SUBROUTINE EXECUTE_DIR_FFT_FLOAT_C(PREEL_REAL,PREEL_COMPLEX,KFIELD,LOENS,OFFSETS,NFFT,ALLOC) & & BIND(C, NAME="execute_dir_fft_float") - USE ISO_C_BINDING + USE ISO_C_BINDING, ONLY: C_FLOAT, C_INT, C_PTR, C_INT64_T REAL(KIND=C_FLOAT), INTENT(IN) :: PREEL_REAL(*) REAL(KIND=C_FLOAT), INTENT(OUT) :: PREEL_COMPLEX(*) INTEGER(KIND=C_INT),INTENT(IN),VALUE :: KFIELD - INTEGER(KIND=C_INT),INTENT(IN) :: LOENS(*), OFFSETS(*) + INTEGER(KIND=C_INT),INTENT(IN) :: LOENS(*) + INTEGER(KIND=C_INT64_T),INTENT(IN) :: OFFSETS(*) INTEGER(KIND=C_INT),INTENT(IN),VALUE :: NFFT TYPE(C_PTR), INTENT(IN), VALUE :: ALLOC END SUBROUTINE @@ -77,23 +79,25 @@ SUBROUTINE EXECUTE_DIR_FFT_FLOAT_C(PREEL_REAL,PREEL_COMPLEX,KFIELD,LOENS,OFFSETS END SUBROUTINE EXECUTE_DIR_FFT_FLOAT SUBROUTINE EXECUTE_DIR_FFT_DOUBLE(PREEL_REAL,PREEL_COMPLEX,KFIELD,LOENS,OFFSETS,ALLOC) - USE EC_PARKIND ,ONLY : JPIM + USE EC_PARKIND ,ONLY : JPIM, JPIB IMPLICIT NONE REAL(KIND=C_DOUBLE), INTENT(IN) :: PREEL_REAL(:) REAL(KIND=C_DOUBLE), INTENT(OUT) :: PREEL_COMPLEX(:) INTEGER(KIND=JPIM),INTENT(IN) :: KFIELD - INTEGER(KIND=JPIM),INTENT(IN) :: LOENS(:), OFFSETS(:) + INTEGER(KIND=JPIM),INTENT(IN) :: LOENS(:) + INTEGER(KIND=JPIB),INTENT(IN) :: OFFSETS(:) TYPE(GROWING_ALLOCATION_TYPE), INTENT(IN) :: ALLOC INTERFACE SUBROUTINE EXECUTE_DIR_FFT_DOUBLE_C(PREEL_REAL,PREEL_COMPLEX,KFIELD,LOENS,OFFSETS,NFFT,ALLOC) & & BIND(C, NAME="execute_dir_fft_double") - USE ISO_C_BINDING, ONLY: C_DOUBLE, C_INT, C_PTR + USE ISO_C_BINDING, ONLY: C_DOUBLE, C_INT, C_PTR, C_INT64_T REAL(KIND=C_DOUBLE), INTENT(IN) :: PREEL_REAL(*) REAL(KIND=C_DOUBLE), INTENT(OUT) :: PREEL_COMPLEX(*) INTEGER(KIND=C_INT),INTENT(IN),VALUE :: KFIELD - INTEGER(KIND=C_INT),INTENT(IN) :: LOENS(*), OFFSETS(*) + INTEGER(KIND=C_INT),INTENT(IN) :: LOENS(*) + INTEGER(KIND=C_INT64_T),INTENT(IN) :: OFFSETS(*) INTEGER(KIND=C_INT),INTENT(IN),VALUE :: NFFT TYPE(C_PTR), INTENT(IN), VALUE :: ALLOC END SUBROUTINE @@ -110,23 +114,25 @@ SUBROUTINE EXECUTE_DIR_FFT_DOUBLE_C(PREEL_REAL,PREEL_COMPLEX,KFIELD,LOENS,OFFSET END SUBROUTINE EXECUTE_DIR_FFT_DOUBLE SUBROUTINE EXECUTE_INV_FFT_FLOAT(PREEL_COMPLEX,PREEL_REAL,KFIELD,LOENS,OFFSETS,ALLOC) - USE EC_PARKIND ,ONLY : JPIM + USE EC_PARKIND ,ONLY : JPIM, JPIB IMPLICIT NONE REAL(KIND=C_FLOAT), INTENT(IN) :: PREEL_COMPLEX(:) REAL(KIND=C_FLOAT), INTENT(OUT) :: PREEL_REAL(:) INTEGER(KIND=JPIM),INTENT(IN) :: KFIELD - INTEGER(KIND=JPIM),INTENT(IN) :: LOENS(:), OFFSETS(:) + INTEGER(KIND=JPIM),INTENT(IN) :: LOENS(:) + INTEGER(KIND=JPIB),INTENT(IN) :: OFFSETS(:) TYPE(GROWING_ALLOCATION_TYPE), INTENT(IN) :: ALLOC INTERFACE SUBROUTINE EXECUTE_INV_FFT_FLOAT_C(PREEL_COMPLEX,PREEL_REAL,KFIELD,LOENS,OFFSETS,NFFT,ALLOC) & & BIND(C, NAME="execute_inv_fft_float") - USE ISO_C_BINDING, ONLY: C_FLOAT, C_INT, C_PTR + USE ISO_C_BINDING, ONLY: C_FLOAT, C_INT, C_PTR, C_INT64_T REAL(KIND=C_FLOAT), INTENT(IN) :: PREEL_COMPLEX(*) REAL(KIND=C_FLOAT), INTENT(OUT) :: PREEL_REAL(*) INTEGER(KIND=C_INT),INTENT(IN),VALUE :: KFIELD - INTEGER(KIND=C_INT),INTENT(IN) :: LOENS(*), OFFSETS(*) + INTEGER(KIND=C_INT),INTENT(IN) :: LOENS(*) + INTEGER(KIND=C_INT64_T),INTENT(IN) :: OFFSETS(*) INTEGER(KIND=C_INT),INTENT(IN),VALUE :: NFFT TYPE(C_PTR), INTENT(IN), VALUE :: ALLOC END SUBROUTINE @@ -142,23 +148,25 @@ SUBROUTINE EXECUTE_INV_FFT_FLOAT_C(PREEL_COMPLEX,PREEL_REAL,KFIELD,LOENS,OFFSETS END SUBROUTINE SUBROUTINE EXECUTE_INV_FFT_DOUBLE(PREEL_COMPLEX,PREEL_REAL,KFIELD,LOENS,OFFSETS,ALLOC) - USE EC_PARKIND ,ONLY : JPIM + USE EC_PARKIND ,ONLY : JPIM, JPIB IMPLICIT NONE REAL(KIND=C_DOUBLE), INTENT(IN) :: PREEL_COMPLEX(:) REAL(KIND=C_DOUBLE), INTENT(OUT) :: PREEL_REAL(:) INTEGER(KIND=JPIM),INTENT(IN) :: KFIELD - INTEGER(KIND=JPIM),INTENT(IN) :: LOENS(:), OFFSETS(:) + INTEGER(KIND=JPIM),INTENT(IN) :: LOENS(:) + INTEGER(KIND=JPIB),INTENT(IN) :: OFFSETS(:) TYPE(GROWING_ALLOCATION_TYPE), INTENT(IN) :: ALLOC INTERFACE SUBROUTINE EXECUTE_INV_FFT_DOUBLE_C(PREEL_COMPLEX,PREEL_REAL,KFIELD,LOENS,OFFSETS,NFFT,ALLOC) & & BIND(C, NAME="execute_inv_fft_double") - USE ISO_C_BINDING, ONLY: C_DOUBLE, C_INT, C_PTR + USE ISO_C_BINDING, ONLY: C_DOUBLE, C_INT, C_PTR, C_INT64_T REAL(KIND=C_DOUBLE), INTENT(IN) :: PREEL_COMPLEX(*) REAL(KIND=C_DOUBLE), INTENT(OUT) :: PREEL_REAL(*) INTEGER(KIND=C_INT),INTENT(IN),VALUE :: KFIELD - INTEGER(KIND=C_INT),INTENT(IN) :: LOENS(*), OFFSETS(*) + INTEGER(KIND=C_INT),INTENT(IN) :: LOENS(*) + INTEGER(KIND=C_INT64_T),INTENT(IN) :: OFFSETS(*) INTEGER(KIND=C_INT),INTENT(IN),VALUE :: NFFT TYPE(C_PTR), INTENT(IN), VALUE :: ALLOC END SUBROUTINE diff --git a/src/trans/gpu/internal/trgtol_mod.F90 b/src/trans/gpu/internal/trgtol_mod.F90 index d3580ad3d..310625e26 100755 --- a/src/trans/gpu/internal/trgtol_mod.F90 +++ b/src/trans/gpu/internal/trgtol_mod.F90 @@ -1,6 +1,5 @@ #define ALIGN(I, A) (((I)+(A)-1)/(A)*(A)) -! (C) Copyright 1995- ECMWF. -! (C) Copyright 1995- Meteo-France. +! (C) Copyright 1995- ECMWF.,KMLOC,KM ! (C) Copyright 1995- Meteo-France. ! (C) Copyright 2022- NVIDIA. ! ! This software is licensed under the terms of the Apache Licence Version 2.0 @@ -22,7 +21,7 @@ MODULE TRGTOL_MOD END TYPE CONTAINS FUNCTION PREPARE_TRGTOL(ALLOCATOR,KF_GP,KF_FS) RESULT(HTRGTOL) - USE PARKIND_ECTRANS, ONLY: JPIM, JPRB, JPRBT + USE PARKIND_ECTRANS, ONLY: JPIM, JPRB, JPRBT, JPIB USE TPM_DISTR, ONLY: D USE BUFFERED_ALLOCATOR_MOD, ONLY: BUFFERED_ALLOCATOR, RESERVE USE ISO_C_BINDING, ONLY: C_SIZE_T, C_SIZEOF @@ -35,13 +34,14 @@ FUNCTION PREPARE_TRGTOL(ALLOCATOR,KF_GP,KF_FS) RESULT(HTRGTOL) REAL(KIND=JPRBT) :: DUMMY - INTEGER(KIND=C_SIZE_T) :: NELEM + INTEGER(KIND=JPIB) :: NELEM - HTRGTOL%HCOMBUFS = RESERVE(ALLOCATOR, INT(KF_GP*D%NGPTOT,KIND=C_SIZE_T)*C_SIZEOF(DUMMY)) + HTRGTOL%HCOMBUFS = RESERVE(ALLOCATOR, 1_JPIB*KF_GP*D%NGPTOT*C_SIZEOF(DUMMY), "HTRGTOL%HCOMBUFS") - NELEM = INT(KF_FS*D%NLENGTF,KIND=C_SIZE_T)*C_SIZEOF(DUMMY) ! ZCOMBUFR - NELEM = NELEM + INT(KF_FS*D%NLENGTF,KIND=C_SIZE_T)*C_SIZEOF(DUMMY) ! PREEL_REAL - HTRGTOL%HCOMBUFR_AND_REEL = RESERVE(ALLOCATOR, NELEM) + NELEM = 0 + NELEM = NELEM + 1_JPIB*KF_FS*D%NLENGTF*C_SIZEOF(DUMMY) ! ZCOMBUFR + NELEM = NELEM + 1_JPIB*KF_FS*D%NLENGTF*C_SIZEOF(DUMMY) ! PREEL_REAL + HTRGTOL%HCOMBUFR_AND_REEL = RESERVE(ALLOCATOR, NELEM, "HTRGTOL%HCOMBUFR_AND_REEL") END FUNCTION PREPARE_TRGTOL SUBROUTINE TRGTOL(ALLOCATOR,HTRGTOL,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G,& @@ -103,9 +103,9 @@ SUBROUTINE TRGTOL(ALLOCATOR,HTRGTOL,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G, ! 09-01-02 G.Mozdzynski: use non-blocking recv and send ! ------------------------------------------------------------------ - USE PARKIND_ECTRANS, ONLY: JPIM, JPRB, JPRBT + USE PARKIND_ECTRANS, ONLY: JPIM, JPRB, JPRBT, JPIB USE YOMHOOK, ONLY: LHOOK, DR_HOOK, JPHOOK - USE MPL_MODULE, ONLY: MPL_WAIT, MPL_BARRIER + USE MPL_MODULE, ONLY: MPL_WAIT, MPL_BARRIER, MPL_ABORT USE TPM_GEN, ONLY: LSYNC_TRANS USE EQ_REGIONS_MOD, ONLY: MY_REGION_EW, MY_REGION_NS USE TPM_DISTR, ONLY: D, MYSETV, MYSETW, MTAGLG, NPRCIDS, MYPROC, NPROC, NPRTRW, & @@ -142,28 +142,33 @@ SUBROUTINE TRGTOL(ALLOCATOR,HTRGTOL,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G, ! LOCAL INTEGER SCALARS REAL(KIND=JPRBT), POINTER :: ZCOMBUFS(:),ZCOMBUFR(:) - INTEGER(KIND=JPIM) :: ISENDTOT (NPROC) - INTEGER(KIND=JPIM) :: IRECVTOT (NPROC) + LOGICAL :: LLOCAL_CONTRIBUTION + INTEGER(KIND=JPIB) :: ISENDTOT (NPROC) + INTEGER(KIND=JPIB) :: IRECVTOT (NPROC) + INTEGER(KIND=JPIM) :: ISENDTOT_MPI(NPROC) + INTEGER(KIND=JPIM) :: IRECVTOT_MPI(NPROC) INTEGER(KIND=JPIM) :: IREQ (NPROC*2) INTEGER(KIND=JPIM) :: IRECV_TO_PROC(NPROC) INTEGER(KIND=JPIM) :: ISEND_TO_PROC(NPROC) INTEGER(KIND=JPIM) :: IFIRSTLAT, IGL, IGLL, ILAST,& - &ILASTLAT, ILEN, JROC, IPOS, ISETA, & + &ILASTLAT, ILEN, JROC, ISETA, & &ISETB, IRECV, & &ISETV, ISEND, JBLK, JFLD, & &JGL, JI, JK, JL, ISETW, IFLD, & &II,IBUFLENR,IRECV_COUNTS, IPROC,IFLDS, & &ISEND_COUNTS,INS,INR,IR, JKL, PBOUND, IERROR, ILOCAL_LAT INTEGER(KIND=JPIM) :: KF, KGL, KI, J3 + INTEGER(KIND=JPIB) :: IPOS INTEGER(KIND=JPIM) :: IOFF, ILAT_STRIP - INTEGER(KIND=JPIM) :: IRECV_BUFR_TO_OUT(D%NLENGTF,2),IRECV_BUFR_TO_OUT_OFFSET(NPROC), IRECV_BUFR_TO_OUT_V + INTEGER(KIND=JPIB) :: IRECV_BUFR_TO_OUT(D%NLENGTF,2) + INTEGER(KIND=JPIB) :: IRECV_BUFR_TO_OUT_OFFSET(NPROC), IRECV_BUFR_TO_OUT_V INTEGER(KIND=JPIM) :: ISEND_FIELD_COUNT(NPRTRV),ISEND_FIELD_COUNT_V INTEGER(KIND=JPIM) :: ISEND_WSET_SIZE(NPRTRW),ISEND_WSET_SIZE_V INTEGER(KIND=JPIM) :: ISEND_WSET_OFFSET(NPRTRW+1), ISEND_WSET_OFFSET_V - INTEGER(KIND=JPIM), ALLOCATABLE :: ICOMBUFS_OFFSET(:),ICOMBUFR_OFFSET(:) - INTEGER(KIND=JPIM) :: ICOMBUFS_OFFSET_V, ICOMBUFR_OFFSET_V + INTEGER(KIND=JPIB), ALLOCATABLE :: ICOMBUFS_OFFSET(:),ICOMBUFR_OFFSET(:) + INTEGER(KIND=JPIB) :: ICOMBUFS_OFFSET_V, ICOMBUFR_OFFSET_V INTEGER(KIND=JPIM) :: IFLDA(KF_GP) INTEGER(KIND=JPIM) :: IVSET(KF_GP) @@ -281,8 +286,9 @@ SUBROUTINE TRGTOL(ALLOCATOR,HTRGTOL,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G, DO JROC=1,NPROC CALL PE2SET(JROC,ISETA,ISETB,ISETW,ISETV) ! total send size is # points per field * # fields - ISENDTOT(JROC) = ISEND_WSET_SIZE(ISETW)*ISEND_FIELD_COUNT(ISETV) + ISENDTOT(JROC) = 1_JPIB*ISEND_WSET_SIZE(ISETW)*ISEND_FIELD_COUNT(ISETV) ENDDO + LLOCAL_CONTRIBUTION = ISENDTOT(MYPROC) > 0 ! Prepare receiver arrays IRECV_BUFR_TO_OUT_OFFSET(:) = 0 @@ -325,8 +331,7 @@ SUBROUTINE TRGTOL(ALLOCATOR,HTRGTOL,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G, block CALL ASSIGN_PTR(PREEL_REAL, GET_ALLOCATION(ALLOCATOR, HTRGTOL%HCOMBUFR_AND_REEL),& - & INT(KF_FS*D%NLENGTF,KIND=C_SIZE_T)*C_SIZEOF(PREEL_REAL(1))+1_C_SIZE_T, & - & INT(KF_FS*D%NLENGTF,KIND=C_SIZE_T)*C_SIZEOF(PREEL_REAL(1))) + & 1_JPIB*KF_FS*D%NLENGTF*C_SIZEOF(PREEL_REAL(1))+1, 1_JPIB*KF_FS*D%NLENGTF*C_SIZEOF(PREEL_REAL(1))) !!CALL ASSIGN_PTR(PREEL_REAL, GET_ALLOCATION(ALLOCATOR, HTRGTOL%HCOMBUFR_AND_REEL), size1, size2) end block @@ -457,7 +462,7 @@ SUBROUTINE TRGTOL(ALLOCATOR,HTRGTOL,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G, IF (ISEND_COUNTS > 0) THEN CALL ASSIGN_PTR(ZCOMBUFS, GET_ALLOCATION(ALLOCATOR, HTRGTOL%HCOMBUFS),& - & 1_C_SIZE_T, INT(ICOMBUFS_OFFSET(ISEND_COUNTS+1),KIND=C_SIZE_T)*C_SIZEOF(ZCOMBUFS(1))) + & 1_JPIB, ICOMBUFS_OFFSET(ISEND_COUNTS+1)*C_SIZEOF(ZCOMBUFS(1))) ENDIF !....Pack loop......................................................... @@ -566,7 +571,7 @@ SUBROUTINE TRGTOL(ALLOCATOR,HTRGTOL,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G, CALL GSTATS(411,0) IF (IRECV_COUNTS > 0) THEN CALL ASSIGN_PTR(ZCOMBUFR, GET_ALLOCATION(ALLOCATOR, HTRGTOL%HCOMBUFR_AND_REEL),& - & 1_C_SIZE_T, INT(ICOMBUFR_OFFSET(IRECV_COUNTS+1),KIND=C_SIZE_T)*C_SIZEOF(ZCOMBUFR(1))) + & 1_JPIB, ICOMBUFR_OFFSET(IRECV_COUNTS+1)*C_SIZEOF(ZCOMBUFR(1))) ENDIF #ifdef OMPGPU #endif @@ -586,12 +591,24 @@ SUBROUTINE TRGTOL(ALLOCATOR,HTRGTOL,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G, !! this is safe-but-slow fallback for running without GPU-aware MPI !$ACC UPDATE HOST(ZCOMBUFS) IF(ISEND_COUNTS > 0) #endif + + ! Skip the own contribution because this is ok to overflow + ISENDTOT(MYPROC) = 0 + IRECVTOT(MYPROC) = 0 + + ISENDTOT_MPI = ISENDTOT + IRECVTOT_MPI = IRECVTOT + IF (ANY(ISENDTOT_MPI /= ISENDTOT)) & + & CALL MPL_ABORT("Overflow in trgtol") + IF (ANY(IRECVTOT_MPI /= IRECVTOT)) & + & CALL MPL_ABORT("Overflow in trgtol") + ! Receive loop......................................................... DO INR=1,IRECV_COUNTS IR=IR+1 IPROC=IRECV_TO_PROC(INR) #if ECTRANS_HAVE_MPI - CALL MPI_IRECV(ZCOMBUFR(ICOMBUFR_OFFSET(INR)+1:ICOMBUFR_OFFSET(INR+1)),IRECVTOT(IPROC), & + CALL MPI_IRECV(ZCOMBUFR(ICOMBUFR_OFFSET(INR)+1:ICOMBUFR_OFFSET(INR+1)),IRECVTOT_MPI(IPROC), & & TRGTOL_DTYPE,NPRCIDS(IPROC)-1,MTAGLG,LOCAL_COMM,IREQUEST(IR),IERROR) IREQ(IR) = IREQUEST(IR)%MPI_VAL #else @@ -604,7 +621,7 @@ SUBROUTINE TRGTOL(ALLOCATOR,HTRGTOL,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G, IR=IR+1 ISEND=ISEND_TO_PROC(INS) #if ECTRANS_HAVE_MPI - CALL MPI_ISEND(ZCOMBUFS(ICOMBUFS_OFFSET(INS)+1:ICOMBUFS_OFFSET(INS+1)),ISENDTOT(ISEND), & + CALL MPI_ISEND(ZCOMBUFS(ICOMBUFS_OFFSET(INS)+1:ICOMBUFS_OFFSET(INS+1)),ISENDTOT_MPI(ISEND), & & TRGTOL_DTYPE,NPRCIDS(ISEND)-1,MTAGLG,LOCAL_COMM,IREQUEST(IR),IERROR) IREQ(IR) = IREQUEST(IR)%MPI_VAL #else @@ -613,7 +630,7 @@ SUBROUTINE TRGTOL(ALLOCATOR,HTRGTOL,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G, ENDDO ! Copy local contribution - IF(ISENDTOT(MYPROC) > 0 )THEN + IF(LLOCAL_CONTRIBUTION)THEN ! I have to send something to myself... ! Input is KF_GP fields. We find the resulting KF_FS fields. @@ -698,7 +715,6 @@ SUBROUTINE TRGTOL(ALLOCATOR,HTRGTOL,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G, #ifdef ACCGPU !$ACC END DATA #endif - ENDIF diff --git a/src/trans/gpu/internal/trltog_mod.F90 b/src/trans/gpu/internal/trltog_mod.F90 index cc5abe34f..782623aa8 100755 --- a/src/trans/gpu/internal/trltog_mod.F90 +++ b/src/trans/gpu/internal/trltog_mod.F90 @@ -22,7 +22,7 @@ MODULE TRLTOG_MOD END TYPE CONTAINS FUNCTION PREPARE_TRLTOG(ALLOCATOR,KF_FS,KF_GP) RESULT(HTRLTOG) - USE PARKIND_ECTRANS, ONLY: JPIM, JPRBT + USE PARKIND_ECTRANS, ONLY: JPIM, JPRBT, JPIB USE TPM_DISTR, ONLY: D USE BUFFERED_ALLOCATOR_MOD, ONLY: BUFFERED_ALLOCATOR, RESERVE USE ISO_C_BINDING, ONLY: C_SIZE_T, C_SIZEOF @@ -35,12 +35,13 @@ FUNCTION PREPARE_TRLTOG(ALLOCATOR,KF_FS,KF_GP) RESULT(HTRLTOG) REAL(KIND=JPRBT) :: DUMMY - INTEGER(KIND=C_SIZE_T) :: NELEM + INTEGER(KIND=JPIB) :: NELEM - NELEM = ALIGN(INT(KF_GP*D%NGPTOT,KIND=C_SIZE_T)*C_SIZEOF(DUMMY),128) ! ZCOMBUFR - NELEM = ALIGN(NELEM + INT(KF_FS*D%NLENGTF,KIND=C_SIZE_T)*C_SIZEOF(DUMMY),128) !ZCOMBUFS upper bound + NELEM = 0 + NELEM = NELEM + ALIGN(1_JPIB*KF_GP*D%NGPTOT*C_SIZEOF(DUMMY),128) ! ZCOMBUFR + NELEM = NELEM + ALIGN(1_JPIB*KF_FS*D%NLENGTF*C_SIZEOF(DUMMY),128) !ZCOMBUFS upper obund - HTRLTOG%HCOMBUFR_AND_COMBUFS = RESERVE(ALLOCATOR, NELEM) + HTRLTOG%HCOMBUFR_AND_COMBUFS = RESERVE(ALLOCATOR, NELEM, "HTRLTOG%HCOMBUFR_AND_COMBUFS") END FUNCTION PREPARE_TRLTOG SUBROUTINE TRLTOG(ALLOCATOR,HTRLTOG,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G,KPTRGP,& @@ -104,9 +105,9 @@ SUBROUTINE TRLTOG(ALLOCATOR,HTRLTOG,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G, ! 09-01-02 G.Mozdzynski: use non-blocking recv and send ! ------------------------------------------------------------------ - USE PARKIND_ECTRANS, ONLY: JPIM, JPRB, JPRBT + USE PARKIND_ECTRANS, ONLY: JPIM, JPRB, JPRBT, JPIB USE YOMHOOK, ONLY: LHOOK, DR_HOOK, JPHOOK - USE MPL_MODULE, ONLY: MPL_WAIT, MPL_BARRIER + USE MPL_MODULE, ONLY: MPL_WAIT, MPL_BARRIER, MPL_ABORT USE TPM_GEN, ONLY: LSYNC_TRANS, NERR USE EQ_REGIONS_MOD, ONLY: MY_REGION_EW, MY_REGION_NS USE TPM_DISTR, ONLY: D,MYSETV, MYSETW, MTAGLG,NPRCIDS,MYPROC,NPROC,NPRTRW,NPRTRV @@ -155,17 +156,21 @@ SUBROUTINE TRLTOG(ALLOCATOR,HTRLTOG,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G, REAL(KIND=JPRBT), POINTER :: ZCOMBUFS(:),ZCOMBUFR(:) - INTEGER(KIND=JPIM) :: ISENDTOT (NPROC) - INTEGER(KIND=JPIM) :: IRECVTOT (NPROC) + LOGICAL :: LLOCAL_CONTRIBUTION + INTEGER(KIND=JPIB) :: ISENDTOT (NPROC) + INTEGER(KIND=JPIB) :: IRECVTOT (NPROC) + INTEGER(KIND=JPIM) :: ISENDTOT_MPI(NPROC) + INTEGER(KIND=JPIM) :: IRECVTOT_MPI(NPROC) INTEGER(KIND=JPIM) :: IREQ (NPROC*2) INTEGER(KIND=JPIM) :: IRECV_TO_PROC(NPROC) INTEGER(KIND=JPIM) :: ISEND_TO_PROC(NPROC) INTEGER(KIND=JPIM) :: JFLD, J, JI, J1, J2, JGL, JK, JL, IFLDS, JROC, INR, INS INTEGER(KIND=JPIM) :: IFIRSTLAT, ILASTLAT, IFLD, IGL, IGLL,& - &IPOS, ISETA, ISETB, ISETV, ISEND, IRECV, ISETW, IPROC, & + &ISETA, ISETB, ISETV, ISEND, IRECV, ISETW, IPROC, & &IR, ILOCAL_LAT, ISEND_COUNTS, IRECV_COUNTS, IERROR, II, ILEN, IBUFLENS, IBUFLENR, & &JBLK, ILAT_STRIP + INTEGER(KIND=JPIB) :: IPOS ! Contains FIELD, PARS, LEVS INTEGER(KIND=JPIM) :: IGP_OFFSETS(KF_GP,3) @@ -173,12 +178,13 @@ SUBROUTINE TRLTOG(ALLOCATOR,HTRLTOG,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G, INTEGER(KIND=JPIM) :: IUVPAR,IGP2PAR,IGP3ALEV,IGP3APAR,IGP3BLEV,IGP3BPAR,IPAROFF,IOFF INTEGER(KIND=JPIM) :: IFLDA(KF_GP) - INTEGER(KIND=JPIM) :: IIN_TO_SEND_BUFR(D%NLENGTF,2),IIN_TO_SEND_BUFR_OFFSET(NPROC), IIN_TO_SEND_BUFR_V + INTEGER(KIND=JPIB) :: IIN_TO_SEND_BUFR(D%NLENGTF,2) + INTEGER(KIND=JPIM) :: IIN_TO_SEND_BUFR_OFFSET(NPROC), IIN_TO_SEND_BUFR_V INTEGER(KIND=JPIM) :: IRECV_FIELD_COUNT(NPRTRV),IRECV_FIELD_COUNT_V INTEGER(KIND=JPIM) :: IRECV_WSET_SIZE(NPRTRW),IRECV_WSET_SIZE_V INTEGER(KIND=JPIM) :: IRECV_WSET_OFFSET(NPRTRW+1), IRECV_WSET_OFFSET_V - INTEGER(KIND=JPIM), ALLOCATABLE :: ICOMBUFS_OFFSET(:),ICOMBUFR_OFFSET(:) - INTEGER(KIND=JPIM) :: ICOMBUFS_OFFSET_V, ICOMBUFR_OFFSET_V + INTEGER(KIND=JPIB), ALLOCATABLE :: ICOMBUFS_OFFSET(:),ICOMBUFR_OFFSET(:) + INTEGER(KIND=JPIB) :: ICOMBUFS_OFFSET_V, ICOMBUFR_OFFSET_V INTEGER(KIND=JPIM) :: IVSETUV(KF_UV_G) INTEGER(KIND=JPIM) :: IVSETSC(KF_SCALARS_G) @@ -444,7 +450,7 @@ SUBROUTINE TRLTOG(ALLOCATOR,HTRLTOG,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G, DO JROC=1,NPROC CALL PE2SET(JROC,ISETA,ISETB,ISETW,ISETV) ! total recv size is # points per field * # fields - IRECVTOT(JROC) = IRECV_WSET_SIZE(ISETW)*IRECV_FIELD_COUNT(ISETV) + IRECVTOT(JROC) = 1_JPIB*IRECV_WSET_SIZE(ISETW)*IRECV_FIELD_COUNT(ISETV) ENDDO ! Prepare sender arrays @@ -476,7 +482,7 @@ SUBROUTINE TRLTOG(ALLOCATOR,HTRLTOG,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G, IPOS = IPOS+1 ! offset to first layer of this gridpoint IIN_TO_SEND_BUFR(IIN_TO_SEND_BUFR_OFFSET(JROC)+IPOS,1) = & - & KF_FS*D%NSTAGTF(IGLL)+(D%NSTA(IGL,ISETB)-1)+(JL-1) + & 1_JPIB*KF_FS*D%NSTAGTF(IGLL)+(D%NSTA(IGL,ISETB)-1)+(JL-1) ! distance between two layers of this gridpoint IIN_TO_SEND_BUFR(IIN_TO_SEND_BUFR_OFFSET(JROC)+IPOS,2) = & & D%NSTAGTF(IGLL+1)-D%NSTAGTF(IGLL) @@ -485,6 +491,7 @@ SUBROUTINE TRLTOG(ALLOCATOR,HTRLTOG,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G, !we always receive the full fourier space ISENDTOT(JROC) = IPOS*KF_FS ENDDO + LLOCAL_CONTRIBUTION = ISENDTOT(MYPROC) > 0 #ifdef OMPGPU #endif @@ -532,7 +539,7 @@ SUBROUTINE TRLTOG(ALLOCATOR,HTRLTOG,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G, CALL GSTATS(1806,1) ! Copy local contribution - IF(ISENDTOT(MYPROC) > 0) THEN + IF(LLOCAL_CONTRIBUTION) THEN ! I have to send something to myself... ! Input is KF_GP fields. We find the resulting KF_FS fields. @@ -611,7 +618,6 @@ SUBROUTINE TRLTOG(ALLOCATOR,HTRLTOG,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G, #ifdef ACCGPU !$ACC END DATA #endif - ENDIF ! Figure out processes that send or recv something @@ -645,12 +651,12 @@ SUBROUTINE TRLTOG(ALLOCATOR,HTRLTOG,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G, IF (IRECV_COUNTS > 0) THEN CALL ASSIGN_PTR(ZCOMBUFR, GET_ALLOCATION(ALLOCATOR, HTRLTOG%HCOMBUFR_AND_COMBUFS),& - & 1_C_SIZE_T, INT(ICOMBUFR_OFFSET(IRECV_COUNTS+1),KIND=C_SIZE_T)*C_SIZEOF(ZCOMBUFR(1))) + & 1_JPIB, ICOMBUFR_OFFSET(IRECV_COUNTS+1)*C_SIZEOF(ZCOMBUFR(1))) ENDIF IF (ISEND_COUNTS > 0) THEN CALL ASSIGN_PTR(ZCOMBUFS, GET_ALLOCATION(ALLOCATOR, HTRLTOG%HCOMBUFR_AND_COMBUFS),& - & ALIGN(INT(KF_GP*D%NGPTOT,KIND=C_SIZE_T)*C_SIZEOF(ZCOMBUFR(1)),128)+1_C_SIZE_T, & - & INT(ICOMBUFS_OFFSET(ISEND_COUNTS+1),KIND=C_SIZE_T)*C_SIZEOF(ZCOMBUFS(1))) + & ALIGN(1_JPIB*KF_GP*D%NGPTOT*C_SIZEOF(ZCOMBUFR(1)),128)+1, & + & ICOMBUFS_OFFSET(ISEND_COUNTS+1)*C_SIZEOF(ZCOMBUFS(1))) ENDIF #ifdef OMPGPU @@ -710,12 +716,24 @@ SUBROUTINE TRLTOG(ALLOCATOR,HTRLTOG,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G, !! this is safe-but-slow fallback for running without GPU-aware MPI !$ACC UPDATE HOST(ZCOMBUFS) IF(ISEND_COUNTS > 0) #endif + + ! Skip the own contribution because this is ok to overflow + ISENDTOT(MYPROC) = 0 + IRECVTOT(MYPROC) = 0 + + ISENDTOT_MPI = ISENDTOT + IRECVTOT_MPI = IRECVTOT + IF (ANY(ISENDTOT_MPI /= ISENDTOT)) & + & CALL MPL_ABORT("Overflow in trltog") + IF (ANY(IRECVTOT_MPI /= IRECVTOT)) & + & CALL MPL_ABORT("Overflow in trltog") + DO INR=1,IRECV_COUNTS IR=IR+1 IRECV=IRECV_TO_PROC(INR) #if ECTRANS_HAVE_MPI CALL MPI_IRECV(ZCOMBUFR(ICOMBUFR_OFFSET(INR)+1:ICOMBUFR_OFFSET(INR+1)), & - & IRECVTOT(IRECV), & + & IRECVTOT_MPI(IRECV), & & TRLTOG_DTYPE,NPRCIDS(IRECV)-1, & & MTAGLG, LOCAL_COMM, IREQUEST(IR), & & IERROR ) @@ -730,7 +748,7 @@ SUBROUTINE TRLTOG(ALLOCATOR,HTRLTOG,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G, IR=IR+1 ISEND=ISEND_TO_PROC(INS) #if ECTRANS_HAVE_MPI - CALL MPI_ISEND(ZCOMBUFS(ICOMBUFS_OFFSET(INS)+1:ICOMBUFS_OFFSET(INS+1)),ISENDTOT(ISEND), & + CALL MPI_ISEND(ZCOMBUFS(ICOMBUFS_OFFSET(INS)+1:ICOMBUFS_OFFSET(INS+1)),ISENDTOT_MPI(ISEND), & & TRLTOG_DTYPE, NPRCIDS(ISEND)-1,MTAGLG,LOCAL_COMM,IREQUEST(IR),IERROR) IREQ(IR) = IREQUEST(IR)%MPI_VAL #else diff --git a/src/trans/gpu/internal/trltom_mod.F90 b/src/trans/gpu/internal/trltom_mod.F90 index 447ea5659..2b0864196 100755 --- a/src/trans/gpu/internal/trltom_mod.F90 +++ b/src/trans/gpu/internal/trltom_mod.F90 @@ -21,7 +21,7 @@ MODULE TRLTOM_MOD END TYPE CONTAINS FUNCTION PREPARE_TRLTOM(ALLOCATOR, KF_FS) RESULT(HTRLTOM) - USE PARKIND_ECTRANS, ONLY: JPIM, JPRBT + USE PARKIND_ECTRANS, ONLY: JPIM, JPRBT, JPIB USE TPM_DISTR, ONLY: D USE BUFFERED_ALLOCATOR_MOD, ONLY: BUFFERED_ALLOCATOR, RESERVE USE ISO_C_BINDING, ONLY: C_SIZE_T, C_SIZEOF @@ -34,7 +34,7 @@ FUNCTION PREPARE_TRLTOM(ALLOCATOR, KF_FS) RESULT(HTRLTOM) REAL(KIND=JPRBT) :: DUMMY - HTRLTOM%HPFBUF = RESERVE(ALLOCATOR, INT(D%NLENGT1B*2*KF_FS,KIND=C_SIZE_T)*C_SIZEOF(DUMMY)) + HTRLTOM%HPFBUF = RESERVE(ALLOCATOR, 2_JPIB*D%NLENGT1B*KF_FS*C_SIZEOF(DUMMY), "HTRLTOM%HPFBUF") END FUNCTION SUBROUTINE TRLTOM(ALLOCATOR,HTRLTOM,PFBUF_IN,PFBUF,KF_FS) @@ -88,7 +88,7 @@ SUBROUTINE TRLTOM(ALLOCATOR,HTRLTOM,PFBUF_IN,PFBUF,KF_FS) ! Y.Seity : 07-08-30 Add barrier synchronisation under LSYNC_TRANS ! ------------------------------------------------------------------ - USE PARKIND_ECTRANS, ONLY: JPIM, JPRBT + USE PARKIND_ECTRANS, ONLY: JPIM, JPRBT, JPIB USE YOMHOOK, ONLY: LHOOK, DR_HOOK, JPHOOK USE MPL_MODULE, ONLY: MPL_ALLTOALLV, MPL_BARRIER, MPL_ALL_MS_COMM, MPL_MYRANK USE TPM_DISTR, ONLY: D, NPRTRW, NPROC, MYPROC, MYSETW @@ -109,7 +109,8 @@ SUBROUTINE TRLTOM(ALLOCATOR,HTRLTOM,PFBUF_IN,PFBUF,KF_FS) REAL(KIND=JPRBT) ,INTENT(INOUT), POINTER :: PFBUF_IN(:) INTEGER(KIND=JPIM) :: ILENS(NPRTRW),IOFFS(NPRTRW),ILENR(NPRTRW),IOFFR(NPRTRW) - INTEGER(KIND=JPIM) :: J, ILEN, ISTA, FROM_SEND, TO_SEND, FROM_RECV, TO_RECV, IRANK + INTEGER(KIND=JPIM) :: J, FROM_SEND, TO_SEND, FROM_RECV, TO_RECV, IRANK + INTEGER(KIND=JPIB) :: JPOS, ISTA, IEND, ILEN REAL(KIND=JPHOOK) :: ZHOOK_HANDLE INTEGER(KIND=JPIM) :: IERROR @@ -132,7 +133,7 @@ SUBROUTINE TRLTOM(ALLOCATOR,HTRLTOM,PFBUF_IN,PFBUF,KF_FS) IF (LHOOK) CALL DR_HOOK('TRLTOM',0,ZHOOK_HANDLE) CALL ASSIGN_PTR(PFBUF, GET_ALLOCATION(ALLOCATOR, HTRLTOM%HPFBUF),& - & 1_C_SIZE_T, INT(D%NLENGT1B*2*KF_FS,KIND=C_SIZE_T)*C_SIZEOF(PFBUF(1))) + & 1_JPIB, 2_JPIB*D%NLENGT1B*KF_FS*C_SIZEOF(PFBUF(1))) #ifdef OMPGPU #endif @@ -221,16 +222,17 @@ SUBROUTINE TRLTOM(ALLOCATOR,HTRLTOM,PFBUF_IN,PFBUF,KF_FS) #endif CALL GSTATS(806,1) ELSE - ILEN = D%NLTSGTB(MYSETW)*2*KF_FS - ISTA = D%NSTAGT1B(MYSETW)*2*KF_FS+1 + ILEN = 2_JPIB*D%NLTSGTB(MYSETW)*KF_FS + ISTA = 2_JPIB*D%NSTAGT1B(MYSETW)*KF_FS+1 + IEND = ISTA+ILEN-1 CALL GSTATS(1607,0) #ifdef OMPGPU #endif #ifdef ACCGPU - !$ACC PARALLEL LOOP DEFAULT(NONE) FIRSTPRIVATE(ISTA,ILEN) + !$ACC PARALLEL LOOP DEFAULT(NONE) FIRSTPRIVATE(ISTA,IEND) #endif - DO J=ISTA,ISTA+ILEN-1 - PFBUF(J) = PFBUF_IN(J) + DO JPOS=ISTA,IEND + PFBUF(JPOS) = PFBUF_IN(JPOS) ENDDO CALL GSTATS(1607,1) ENDIF diff --git a/src/trans/gpu/internal/trltom_pack_unpack.F90 b/src/trans/gpu/internal/trltom_pack_unpack.F90 index d77da0f80..25f5bd564 100755 --- a/src/trans/gpu/internal/trltom_pack_unpack.F90 +++ b/src/trans/gpu/internal/trltom_pack_unpack.F90 @@ -30,7 +30,7 @@ MODULE TRLTOM_PACK_UNPACK CONTAINS FUNCTION PREPARE_TRLTOM_PACK(ALLOCATOR, KF_FS) RESULT(HTRLTOM_PACK) - USE PARKIND_ECTRANS, ONLY: JPIM, JPRBT + USE PARKIND_ECTRANS, ONLY: JPIM, JPRBT, JPIB USE TPM_DISTR, ONLY: D USE ISO_C_BINDING, ONLY: C_SIZE_T, C_SIZEOF USE BUFFERED_ALLOCATOR_MOD, ONLY: BUFFERED_ALLOCATOR, RESERVE @@ -43,7 +43,7 @@ FUNCTION PREPARE_TRLTOM_PACK(ALLOCATOR, KF_FS) RESULT(HTRLTOM_PACK) REAL(KIND=JPRBT) :: DUMMY - HTRLTOM_PACK%HFOUBUF_IN = RESERVE(ALLOCATOR, INT(D%NLENGT0B*KF_FS*2,KIND=C_SIZE_T)*C_SIZEOF(DUMMY)) + HTRLTOM_PACK%HFOUBUF_IN = RESERVE(ALLOCATOR, 2_JPIB*D%NLENGT0B*KF_FS*C_SIZEOF(DUMMY), "HTRLTOM_PACK%HFOUBUF_IN") END FUNCTION PREPARE_TRLTOM_PACK SUBROUTINE TRLTOM_PACK(ALLOCATOR,HTRLTOM_PACK,PREEL_COMPLEX,FOUBUF_IN,KF_FS) @@ -70,7 +70,7 @@ SUBROUTINE TRLTOM_PACK(ALLOCATOR,HTRLTOM_PACK,PREEL_COMPLEX,FOUBUF_IN,KF_FS) ! ------------------------------------------------------------------ USE BUFFERED_ALLOCATOR_MOD, ONLY: BUFFERED_ALLOCATOR, ASSIGN_PTR, GET_ALLOCATION - USE PARKIND_ECTRANS, ONLY: JPIM, JPRBT + USE PARKIND_ECTRANS, ONLY: JPIM, JPRBT, JPIB USE TPM_DISTR, ONLY: D, MYSETW, D_NSTAGTF, D_NPNTGTB0, D_NPTRLS, D_NDGL_FS USE TPM_GEOMETRY, ONLY: G_NMEN, G_NLOEN USE TPM_DIM, ONLY: R_NSMAX @@ -85,12 +85,13 @@ SUBROUTINE TRLTOM_PACK(ALLOCATOR,HTRLTOM_PACK,PREEL_COMPLEX,FOUBUF_IN,KF_FS) TYPE(BUFFERED_ALLOCATOR), INTENT(IN) :: ALLOCATOR TYPE(TRLTOM_PACK_HANDLE), INTENT(IN) :: HTRLTOM_PACK - INTEGER(KIND=JPIM) :: JM,JF,IGLG,ISTA,OFFSET_VAR,IOFF_LAT,KGL + INTEGER(KIND=JPIM) :: JM,JF,IGLG,OFFSET_VAR,KGL + INTEGER(KIND=JPIB) :: IOFF_LAT,ISTA REAL(KIND=JPRBT) :: SCAL CALL ASSIGN_PTR(FOUBUF_IN, GET_ALLOCATION(ALLOCATOR, HTRLTOM_PACK%HFOUBUF_IN),& - & 1_C_SIZE_T, INT(D%NLENGT0B*KF_FS*2,KIND=C_SIZE_T)*C_SIZEOF(FOUBUF_IN(1))) + & 1_JPIB, 2_JPIB*D%NLENGT0B*KF_FS*C_SIZEOF(FOUBUF_IN(1))) #ifdef OMPGPU #endif @@ -116,7 +117,7 @@ SUBROUTINE TRLTOM_PACK(ALLOCATOR,HTRLTOM_PACK,PREEL_COMPLEX,FOUBUF_IN,KF_FS) IOFF_LAT = KF_FS*D_NSTAGTF(KGL)+(JF-1)*(D_NSTAGTF(KGL+1)-D_NSTAGTF(KGL)) SCAL = 1._JPRBT/REAL(G_NLOEN(IGLG),JPRBT) - ISTA = D_NPNTGTB0(JM,KGL)*KF_FS*2 + ISTA = 2_JPIB*D_NPNTGTB0(JM,KGL)*KF_FS FOUBUF_IN(ISTA+2*JF-1) = SCAL * PREEL_COMPLEX(IOFF_LAT+2*JM+1) FOUBUF_IN(ISTA+2*JF ) = SCAL * PREEL_COMPLEX(IOFF_LAT+2*JM+2) @@ -134,7 +135,7 @@ SUBROUTINE TRLTOM_PACK(ALLOCATOR,HTRLTOM_PACK,PREEL_COMPLEX,FOUBUF_IN,KF_FS) END SUBROUTINE TRLTOM_PACK FUNCTION PREPARE_TRLTOM_UNPACK(ALLOCATOR, KF_FS) RESULT(HTRLTOM_UNPACK) - USE PARKIND_ECTRANS, ONLY: JPIM, JPRBT, JPRD + USE PARKIND_ECTRANS, ONLY: JPIM, JPRBT, JPRD, JPIB USE BUFFERED_ALLOCATOR_MOD, ONLY: BUFFERED_ALLOCATOR, RESERVE USE LEDIR_MOD, ONLY: LEDIR_STRIDES USE ISO_C_BINDING, ONLY: C_SIZE_T, C_SIZEOF @@ -145,9 +146,10 @@ FUNCTION PREPARE_TRLTOM_UNPACK(ALLOCATOR, KF_FS) RESULT(HTRLTOM_UNPACK) INTEGER(KIND=JPIM), INTENT(IN) :: KF_FS TYPE(TRLTOM_UNPACK_HANDLE) :: HTRLTOM_UNPACK - INTEGER(KIND=JPIM) :: IIN_STRIDES0, IIN_SIZE + INTEGER(KIND=JPIM) :: IIN_STRIDES0 + INTEGER(KIND=JPIB) :: IIN_SIZE INTEGER(KIND=JPIM) :: IIN0_STRIDES0, IIN0_SIZE - INTEGER(KIND=C_SIZE_T) :: ISIZE + INTEGER(KIND=JPIB) :: ISIZE REAL(KIND=JPRBT) :: ZPRBT_DUMMY REAL(KIND=JPRD) :: ZPRD_DUMMY @@ -156,16 +158,16 @@ FUNCTION PREPARE_TRLTOM_UNPACK(ALLOCATOR, KF_FS) RESULT(HTRLTOM_UNPACK) IIN0_STRIDES0=IIN0_STRIDES0,IIN0_SIZE=IIN0_SIZE) ! Check if the reuse buffer is large enough - ISIZE = ALIGN(INT(IIN_SIZE,KIND=C_SIZE_T)*C_SIZEOF(ZPRBT_DUMMY),128) - ISIZE = ISIZE + ALIGN(INT(IIN_SIZE,KIND=C_SIZE_T)*C_SIZEOF(ZPRBT_DUMMY),128) - ISIZE = ISIZE + ALIGN(INT(IIN0_SIZE,KIND=C_SIZE_T)*C_SIZEOF(ZPRD_DUMMY),128) - ISIZE = ISIZE + ALIGN(INT(IIN0_SIZE,KIND=C_SIZE_T)*C_SIZEOF(ZPRD_DUMMY),128) + ISIZE = ALIGN(IIN_SIZE*C_SIZEOF(ZPRBT_DUMMY),128) + ISIZE = ISIZE + ALIGN(IIN_SIZE*C_SIZEOF(ZPRBT_DUMMY),128) + ISIZE = ISIZE + ALIGN(IIN0_SIZE*C_SIZEOF(ZPRD_DUMMY),128) + ISIZE = ISIZE + ALIGN(IIN0_SIZE*C_SIZEOF(ZPRD_DUMMY),128) - HTRLTOM_UNPACK%HINPS_AND_ZINPA = RESERVE(ALLOCATOR, ISIZE) + HTRLTOM_UNPACK%HINPS_AND_ZINPA = RESERVE(ALLOCATOR, ISIZE, "HTRLTOM_UNPACK%HINPS_AND_ZINPA") END FUNCTION PREPARE_TRLTOM_UNPACK SUBROUTINE TRLTOM_UNPACK(ALLOCATOR,HTRLTOM_UNPACK,FOUBUF,ZINPS,ZINPA,ZINPS0,ZINPA0,KF_FS,KF_UV) - USE PARKIND_ECTRANS, ONLY: JPIM, JPRBT, JPRD + USE PARKIND_ECTRANS, ONLY: JPIM, JPRBT, JPRD, JPIB USE TPM_DIM, ONLY: R_NDGNH, R_NDGL USE TPM_GEOMETRY, ONLY: G_NDGLU USE BUFFERED_ALLOCATOR_MOD, ONLY: BUFFERED_ALLOCATOR, ASSIGN_PTR, GET_ALLOCATION @@ -185,13 +187,14 @@ SUBROUTINE TRLTOM_UNPACK(ALLOCATOR,HTRLTOM_UNPACK,FOUBUF,ZINPS,ZINPA,ZINPS0,ZINP REAL(KIND=JPRBT), POINTER :: PREEL_COMPLEX(:) - INTEGER(KIND=JPIM) :: IIN_STRIDES0, IIN_SIZE + INTEGER(KIND=JPIM) :: IIN_STRIDES0 + INTEGER(KIND=JPIB) :: IIN_SIZE INTEGER(KIND=JPIM) :: IIN0_STRIDES0, IIN0_SIZE - INTEGER(KIND=C_SIZE_T) :: IALLOC_POS, IALLOC_SZ + INTEGER(KIND=JPIB) :: IALLOC_POS, IALLOC_SZ - INTEGER(KIND=8) :: JF - INTEGER(KIND=JPIM) :: KM, ISL, IGLS, OFFSET1, OFFSET2, JGL, KMLOC + INTEGER(KIND=JPIB) :: JF, OFFSET1, OFFSET2 + INTEGER(KIND=JPIM) :: KM, ISL, IGLS, JGL, KMLOC REAL(KIND=JPRBT) :: PAIA, PAIS @@ -200,22 +203,22 @@ SUBROUTINE TRLTOM_UNPACK(ALLOCATOR,HTRLTOM_UNPACK,FOUBUF,ZINPS,ZINPA,ZINPS0,ZINP IALLOC_POS=1 - IALLOC_SZ = ALIGN(INT(IIN_SIZE,KIND=C_SIZE_T)*C_SIZEOF(ZINPS(0)),128) + IALLOC_SZ = ALIGN(IIN_SIZE*C_SIZEOF(ZINPS(0)),128) CALL ASSIGN_PTR(ZINPS, GET_ALLOCATION(ALLOCATOR, HTRLTOM_UNPACK%HINPS_AND_ZINPA),& & IALLOC_POS, IALLOC_SZ) IALLOC_POS=IALLOC_POS+IALLOC_SZ - IALLOC_SZ = ALIGN(INT(IIN_SIZE,KIND=C_SIZE_T)*C_SIZEOF(ZINPA(0)),128) + IALLOC_SZ = ALIGN(IIN_SIZE*C_SIZEOF(ZINPA(0)),128) CALL ASSIGN_PTR(ZINPA, GET_ALLOCATION(ALLOCATOR, HTRLTOM_UNPACK%HINPS_AND_ZINPA),& & IALLOC_POS, IALLOC_SZ) IALLOC_POS=IALLOC_POS+IALLOC_SZ - IALLOC_SZ = ALIGN(INT(IIN0_SIZE,KIND=C_SIZE_T)*C_SIZEOF(ZINPS0(0)),128) + IALLOC_SZ = ALIGN(IIN0_SIZE*C_SIZEOF(ZINPS0(0)),128) CALL ASSIGN_PTR(ZINPS0, GET_ALLOCATION(ALLOCATOR, HTRLTOM_UNPACK%HINPS_AND_ZINPA),& & IALLOC_POS, IALLOC_SZ) IALLOC_POS=IALLOC_POS+IALLOC_SZ - IALLOC_SZ = ALIGN(INT(IIN0_SIZE,KIND=C_SIZE_T)*C_SIZEOF(ZINPA0(0)),128) + IALLOC_SZ = ALIGN(IIN0_SIZE*C_SIZEOF(ZINPA0(0)),128) CALL ASSIGN_PTR(ZINPA0, GET_ALLOCATION(ALLOCATOR, HTRLTOM_UNPACK%HINPS_AND_ZINPA),& & IALLOC_POS, IALLOC_SZ) IALLOC_POS=IALLOC_POS+IALLOC_SZ @@ -240,8 +243,8 @@ SUBROUTINE TRLTOM_UNPACK(ALLOCATOR,HTRLTOM_UNPACK,FOUBUF,ZINPS,ZINPA,ZINPS0,ZINP IF (JGL >= ISL) THEN !(DO JGL=ISL,R_NDGNH) IGLS = R_NDGL+1-JGL - OFFSET1 = D_NPNTGTB1(KMLOC,JGL )*2*KF_FS - OFFSET2 = D_NPNTGTB1(KMLOC,IGLS)*2*KF_FS + OFFSET1 = 2_JPIB*D_NPNTGTB1(KMLOC,JGL )*KF_FS + OFFSET2 = 2_JPIB*D_NPNTGTB1(KMLOC,IGLS)*KF_FS PAIA = FOUBUF(OFFSET1+JF)-FOUBUF(OFFSET2+JF) PAIS = FOUBUF(OFFSET1+JF)+FOUBUF(OFFSET2+JF) IF (JF <= 4*KF_UV) THEN diff --git a/src/trans/gpu/internal/trmtol_mod.F90 b/src/trans/gpu/internal/trmtol_mod.F90 index 9b6745218..fbb48538f 100755 --- a/src/trans/gpu/internal/trmtol_mod.F90 +++ b/src/trans/gpu/internal/trmtol_mod.F90 @@ -21,7 +21,7 @@ MODULE TRMTOL_MOD END TYPE CONTAINS FUNCTION PREPARE_TRMTOL(ALLOCATOR, KF_LEG) RESULT(HTRMTOL) - USE PARKIND_ECTRANS, ONLY: JPIM, JPRBT + USE PARKIND_ECTRANS, ONLY: JPIM, JPRBT, JPIB USE TPM_DISTR, ONLY: D USE BUFFERED_ALLOCATOR_MOD, ONLY: BUFFERED_ALLOCATOR, RESERVE USE ISO_C_BINDING, ONLY: C_SIZE_T, C_SIZEOF @@ -34,7 +34,7 @@ FUNCTION PREPARE_TRMTOL(ALLOCATOR, KF_LEG) RESULT(HTRMTOL) REAL(KIND=JPRBT) :: DUMMY - HTRMTOL%HPFBUF = RESERVE(ALLOCATOR, INT(D%NLENGT0B*2*KF_LEG,KIND=C_SIZE_T)*C_SIZEOF(DUMMY)) + HTRMTOL%HPFBUF = RESERVE(ALLOCATOR, 2_JPIB*D%NLENGT0B*KF_LEG*C_SIZEOF(DUMMY), "HTRMTOL%HPFBUF") END FUNCTION SUBROUTINE TRMTOL(ALLOCATOR,HTRMTOL,PFBUF_IN,PFBUF,KF_LEG) @@ -88,7 +88,7 @@ SUBROUTINE TRMTOL(ALLOCATOR,HTRMTOL,PFBUF_IN,PFBUF,KF_LEG) ! Y.Seity : 07-08-31 add barrier synchronisation under LSYNC_TRANS ! ------------------------------------------------------------------ - USE PARKIND_ECTRANS, ONLY: JPIM, JPRBT + USE PARKIND_ECTRANS, ONLY: JPIM, JPRBT, JPIB USE YOMHOOK, ONLY: LHOOK, DR_HOOK, JPHOOK USE MPL_MODULE, ONLY: MPL_ALLTOALLV, MPL_BARRIER, MPL_ALL_MS_COMM, MPL_MYRANK USE TPM_DISTR, ONLY: D, NPRTRW, NPROC, MYPROC, MYSETW @@ -109,7 +109,8 @@ SUBROUTINE TRMTOL(ALLOCATOR,HTRMTOL,PFBUF_IN,PFBUF,KF_LEG) REAL(KIND=JPRBT), INTENT(IN) :: PFBUF_IN(:) INTEGER(KIND=JPIM) :: ILENS(NPRTRW),IOFFS(NPRTRW),ILENR(NPRTRW),IOFFR(NPRTRW) - INTEGER(KIND=JPIM) :: J, ILEN, ISTA, FROM_SEND, TO_SEND, FROM_RECV, TO_RECV, IRANK + INTEGER(KIND=JPIM) :: J, FROM_SEND, TO_SEND, FROM_RECV, TO_RECV, IRANK + INTEGER(KIND=JPIB) :: JPOS, ISTA, IEND, ILEN REAL(KIND=JPHOOK) :: ZHOOK_HANDLE INTEGER(KIND=JPIM) :: IERROR @@ -133,7 +134,7 @@ SUBROUTINE TRMTOL(ALLOCATOR,HTRMTOL,PFBUF_IN,PFBUF,KF_LEG) IF (LHOOK) CALL DR_HOOK('TRMTOL',0,ZHOOK_HANDLE) CALL ASSIGN_PTR(PFBUF, GET_ALLOCATION(ALLOCATOR, HTRMTOL%HPFBUF),& - & 1_C_SIZE_T, INT(D%NLENGT0B*2*KF_LEG,KIND=C_SIZE_T)*C_SIZEOF(PFBUF(1))) + & 1_JPIB, 2_JPIB*D%NLENGT0B*KF_LEG*C_SIZEOF(PFBUF(1))) IF(NPROC > 1) THEN DO J=1,NPRTRW @@ -226,16 +227,17 @@ SUBROUTINE TRMTOL(ALLOCATOR,HTRMTOL,PFBUF_IN,PFBUF,KF_LEG) #endif CALL GSTATS(807,1) ELSE - ILEN = D%NLTSGTB(MYSETW)*2*KF_LEG - ISTA = D%NSTAGT0B(MYSETW)*2*KF_LEG+1 + ILEN = 2_JPIB*D%NLTSGTB(MYSETW)*KF_LEG + ISTA = 2_JPIB*D%NSTAGT0B(MYSETW)*KF_LEG+1 + IEND = ISTA+ILEN-1 CALL GSTATS(1608,0) #ifdef OMPGPU #endif #ifdef ACCGPU - !$ACC PARALLEL LOOP DEFAULT(NONE) PRESENT(PFBUF,PFBUF_IN) FIRSTPRIVATE(ISTA,ILEN) + !$ACC PARALLEL LOOP DEFAULT(NONE) PRESENT(PFBUF,PFBUF_IN) FIRSTPRIVATE(ISTA,IEND) #endif - DO J=ISTA,ISTA+ILEN-1 - PFBUF(J) = PFBUF_IN(J) + DO JPOS=ISTA,IEND + PFBUF(JPOS) = PFBUF_IN(JPOS) ENDDO CALL GSTATS(1608,1) ENDIF diff --git a/src/trans/gpu/internal/trmtol_pack_unpack.F90 b/src/trans/gpu/internal/trmtol_pack_unpack.F90 index 4468af603..cefa8c2b3 100755 --- a/src/trans/gpu/internal/trmtol_pack_unpack.F90 +++ b/src/trans/gpu/internal/trmtol_pack_unpack.F90 @@ -26,7 +26,7 @@ MODULE TRMTOL_PACK_UNPACK CONTAINS FUNCTION PREPARE_TRMTOL_PACK(ALLOCATOR,KF_LEG) RESULT(HTRMTOL_PACK) - USE PARKIND_ECTRANS, ONLY: JPIM, JPRBT + USE PARKIND_ECTRANS, ONLY: JPIM, JPRBT, JPIB USE TPM_DISTR, ONLY: D USE ISO_C_BINDING, ONLY: C_SIZE_T, C_SIZEOF USE BUFFERED_ALLOCATOR_MOD, ONLY: BUFFERED_ALLOCATOR, RESERVE @@ -38,12 +38,12 @@ FUNCTION PREPARE_TRMTOL_PACK(ALLOCATOR,KF_LEG) RESULT(HTRMTOL_PACK) TYPE(TRMTOL_PACK_HANDLE) :: HTRMTOL_PACK - INTEGER(KIND=C_SIZE_T) :: IALLOC_SZ + INTEGER(KIND=JPIB) :: IALLOC_SZ REAL(KIND=JPRBT) :: ZPRBT_DUMMY - IALLOC_SZ = INT(D%NLENGT1B*2*KF_LEG,KIND=C_SIZE_T)*C_SIZEOF(ZPRBT_DUMMY) - HTRMTOL_PACK%HFOUBUF_IN = RESERVE(ALLOCATOR, int(IALLOC_SZ,kind=c_size_t)) + IALLOC_SZ = 2_JPIB*D%NLENGT1B*KF_LEG*C_SIZEOF(ZPRBT_DUMMY) + HTRMTOL_PACK%HFOUBUF_IN = RESERVE(ALLOCATOR, IALLOC_SZ, "HTRMTOL_PACK%HFOUBUF_IN") END FUNCTION SUBROUTINE TRMTOL_PACK(ALLOCATOR,HTRMTOL_PACK,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,FOUBUF_IN,KF_LEG) @@ -84,7 +84,7 @@ SUBROUTINE TRMTOL_PACK(ALLOCATOR,HTRMTOL_PACK,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,FOUBUF_I ! F. Vana 05-Mar-2015 Support for single precision ! ------------------------------------------------------------------ - USE PARKIND_ECTRANS, ONLY: JPIM, JPRB, JPRBT, JPRD + USE PARKIND_ECTRANS, ONLY: JPIM, JPRB, JPRBT, JPRD, JPIB USE YOMHOOK, ONLY: LHOOK, DR_HOOK, JPHOOK USE TPM_DIM, ONLY: R_NDGNH, R_NDGL USE TPM_GEOMETRY, ONLY: G_NDGLU @@ -107,8 +107,10 @@ SUBROUTINE TRMTOL_PACK(ALLOCATOR,HTRMTOL_PACK,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,FOUBUF_I ! LOCAL REAL(KIND=JPRBT) :: ZAOA, ZSOA - INTEGER(KIND=JPIM) :: KMLOC, KM, ISL, JGL, JK, IGLS, OFFSET1, OFFSET2 - INTEGER(KIND=JPIM) :: IOUT_STRIDES0, IOUT_SIZE + INTEGER(KIND=JPIM) :: KMLOC, KM, ISL, JGL, JK, IGLS + INTEGER(KIND=JPIB) :: OFFSET1, OFFSET2 + INTEGER(KIND=JPIM) :: IOUT_STRIDES0 + INTEGER(KIND=JPIB) :: IOUT_SIZE INTEGER(KIND=JPIM) :: IOUT0_STRIDES0, IOUT0_SIZE REAL(KIND=JPHOOK) :: ZHOOK_HANDLE @@ -116,7 +118,7 @@ SUBROUTINE TRMTOL_PACK(ALLOCATOR,HTRMTOL_PACK,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,FOUBUF_I IF (LHOOK) CALL DR_HOOK('TRMTOL_PACK',0,ZHOOK_HANDLE) CALL ASSIGN_PTR(FOUBUF_IN, GET_ALLOCATION(ALLOCATOR, HTRMTOL_PACK%HFOUBUF_IN),& - & 1_C_SIZE_T, INT(D%NLENGT1B*2*KF_LEG,KIND=C_SIZE_T)*C_SIZEOF(FOUBUF_IN(1))) + & 1_JPIB, 2_JPIB*D%NLENGT1B*KF_LEG*C_SIZEOF(FOUBUF_IN(1))) CALL LEINV_STRIDES(KF_LEG,IOUT_STRIDES0=IOUT_STRIDES0,IOUT_SIZE=IOUT_SIZE,& IOUT0_STRIDES0=IOUT0_STRIDES0,IOUT0_SIZE=IOUT0_SIZE) @@ -142,8 +144,8 @@ SUBROUTINE TRMTOL_PACK(ALLOCATOR,HTRMTOL_PACK,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,FOUBUF_I IF (JGL >= ISL) THEN !(DO JGL=ISL,R_NDGNH) IGLS = R_NDGL+1-JGL - OFFSET1 = D_NPNTGTB1(KMLOC,JGL )*2*KF_LEG - OFFSET2 = D_NPNTGTB1(KMLOC,IGLS)*2*KF_LEG + OFFSET1 = 2_JPIB*D_NPNTGTB1(KMLOC,JGL )*KF_LEG + OFFSET2 = 2_JPIB*D_NPNTGTB1(KMLOC,IGLS)*KF_LEG IF(KM /= 0) THEN ZSOA = ZOUTS(JK+(JGL-ISL)*IOUT_STRIDES0+D_OFFSETS_GEMM1(KMLOC)*IOUT_STRIDES0) @@ -177,7 +179,7 @@ SUBROUTINE TRMTOL_PACK(ALLOCATOR,HTRMTOL_PACK,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,FOUBUF_I END SUBROUTINE TRMTOL_PACK FUNCTION PREPARE_TRMTOL_UNPACK(ALLOCATOR,KF_FS) RESULT(HTRMTOL_UNPACK) - USE PARKIND_ECTRANS, ONLY: JPIM, JPRBT + USE PARKIND_ECTRANS, ONLY: JPIM, JPRBT, JPIB USE TPM_DISTR, ONLY: D USE BUFFERED_ALLOCATOR_MOD, ONLY: BUFFERED_ALLOCATOR, RESERVE USE ISO_C_BINDING, ONLY: C_SIZE_T, C_SIZEOF @@ -191,7 +193,7 @@ FUNCTION PREPARE_TRMTOL_UNPACK(ALLOCATOR,KF_FS) RESULT(HTRMTOL_UNPACK) REAL(KIND=JPRBT) :: DUMMY - HTRMTOL_UNPACK%HREEL = RESERVE(ALLOCATOR, INT(D%NLENGTF*KF_FS,KIND=C_SIZE_T)*C_SIZEOF(DUMMY)) + HTRMTOL_UNPACK%HREEL = RESERVE(ALLOCATOR, 1_JPIB*D%NLENGTF*KF_FS*C_SIZEOF(DUMMY), "HTRMTOL_UNPACK%HREEL") END FUNCTION PREPARE_TRMTOL_UNPACK SUBROUTINE TRMTOL_UNPACK(ALLOCATOR,HTRMTOL_UNPACK,FOUBUF,PREEL_COMPLEX,KF_CURRENT,KF_TOTAL) @@ -223,7 +225,7 @@ SUBROUTINE TRMTOL_UNPACK(ALLOCATOR,HTRMTOL_UNPACK,FOUBUF,PREEL_COMPLEX,KF_CURREN ! ------------------------------------------------------------------ -USE PARKIND_ECTRANS, ONLY: JPIM, JPRBT +USE PARKIND_ECTRANS, ONLY: JPIM, JPRBT, JPIB USE TPM_DISTR, ONLY: D, MYSETW, D_NSTAGTF, D_NPNTGTB0, D_NPTRLS, D_NDGL_FS USE TPM_GEOMETRY, ONLY: G_NMEN, G_NLOEN, G_NLOEN_MAX USE BUFFERED_ALLOCATOR_MOD, ONLY: BUFFERED_ALLOCATOR, ASSIGN_PTR, GET_ALLOCATION @@ -238,11 +240,12 @@ SUBROUTINE TRMTOL_UNPACK(ALLOCATOR,HTRMTOL_UNPACK,FOUBUF,PREEL_COMPLEX,KF_CURREN TYPE(BUFFERED_ALLOCATOR), INTENT(IN) :: ALLOCATOR TYPE(TRMTOL_UNPACK_HANDLE), INTENT(IN) :: HTRMTOL_UNPACK -INTEGER(KIND=JPIM) :: JM,JF,IGLG,ISTA,OFFSET_VAR,IOFF_LAT,KGL +INTEGER(KIND=JPIM) :: JM,JF,IGLG,OFFSET_VAR,KGL +INTEGER(KIND=JPIB) :: IOFF_LAT, ISTA REAL(KIND=JPRBT) :: RET_REAL, RET_COMPLEX CALL ASSIGN_PTR(PREEL_COMPLEX, GET_ALLOCATION(ALLOCATOR, HTRMTOL_UNPACK%HREEL),& - & 1_C_SIZE_T, INT(KF_TOTAL*D%NLENGTF,KIND=C_SIZE_T)*C_SIZEOF(PREEL_COMPLEX(1))) + & 1_JPIB, 1_JPIB*KF_TOTAL*D%NLENGTF*C_SIZEOF(PREEL_COMPLEX(1))) #ifdef OMPGPU #endif @@ -270,12 +273,12 @@ SUBROUTINE TRMTOL_UNPACK(ALLOCATOR,HTRMTOL_UNPACK,FOUBUF,PREEL_COMPLEX,KF_CURREN RET_REAL = 0.0_JPRBT RET_COMPLEX = 0.0_JPRBT IF (JM <= G_NMEN(IGLG)) THEN - ISTA = D_NPNTGTB0(JM,KGL)*KF_CURRENT*2 + ISTA = 2_JPIB*D_NPNTGTB0(JM,KGL)*KF_CURRENT RET_REAL = FOUBUF(ISTA+2*JF-1) RET_COMPLEX = FOUBUF(ISTA+2*JF ) ENDIF - IOFF_LAT = KF_TOTAL*D_NSTAGTF(KGL)+(JF-1)*(D_NSTAGTF(KGL+1)-D_NSTAGTF(KGL)) + IOFF_LAT = 1_JPIB*KF_TOTAL*D_NSTAGTF(KGL)+(JF-1)*(D_NSTAGTF(KGL+1)-D_NSTAGTF(KGL)) PREEL_COMPLEX(IOFF_LAT+2*JM+1) = RET_REAL PREEL_COMPLEX(IOFF_LAT+2*JM+2) = RET_COMPLEX ENDIF From 8495f6039531efe7d9a19afa6b7bdb783e5affac Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Wed, 30 Oct 2024 02:13:47 -0700 Subject: [PATCH 2/3] fix typo --- src/trans/gpu/algor/ext_acc.F90 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/trans/gpu/algor/ext_acc.F90 b/src/trans/gpu/algor/ext_acc.F90 index e550bdad7..49043ab90 100644 --- a/src/trans/gpu/algor/ext_acc.F90 +++ b/src/trans/gpu/algor/ext_acc.F90 @@ -346,7 +346,7 @@ subroutine ext_acc_delete(ptrs, stream) num_ranges = get_common_pointers(ptrs, common_ptrs) do i = 1, num_ranges - call acc_delete(pp, int(common_ptrs(i)%sz)) + call c_f_pointer(common_ptrs(i)%ptr, pp, shape=[common_ptrs(i)%sz/c_sizeof(pp(1))]) !$acc exit data delete(pp) async(stream_act) enddo end subroutine From f214d167b04365778b7e07cef78cd50ee194e860 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Sun, 8 Dec 2024 22:34:23 -0800 Subject: [PATCH 3/3] missing imports, fix diffs --- src/trans/gpu/external/setup_trans.F90 | 2 +- src/trans/gpu/internal/ftdir_mod.F90 | 2 +- src/trans/gpu/internal/ftinv_mod.F90 | 2 +- src/trans/gpu/internal/trgtol_mod.F90 | 4 +++- src/trans/gpu/internal/trltog_mod.F90 | 1 + 5 files changed, 7 insertions(+), 4 deletions(-) diff --git a/src/trans/gpu/external/setup_trans.F90 b/src/trans/gpu/external/setup_trans.F90 index 5b91a7d95..89427f338 100755 --- a/src/trans/gpu/external/setup_trans.F90 +++ b/src/trans/gpu/external/setup_trans.F90 @@ -98,7 +98,7 @@ SUBROUTINE SETUP_TRANS(KSMAX,KDGL,KDLON,KLOEN,LDSPLIT,PSTRET,& ! R. El Khatib 07-Mar-2016 Better flexibility for Legendre polynomials computation in stretched mode ! ------------------------------------------------------------------ -USE PARKIND1, ONLY: JPIM, JPRB, JPRD, JPIB +USE PARKIND1, ONLY: JPIM, JPRB, JPRD USE PARKIND_ECTRANS, ONLY: JPRBT !ifndef INTERFACE diff --git a/src/trans/gpu/internal/ftdir_mod.F90 b/src/trans/gpu/internal/ftdir_mod.F90 index 54468007a..7d373bbc3 100755 --- a/src/trans/gpu/internal/ftdir_mod.F90 +++ b/src/trans/gpu/internal/ftdir_mod.F90 @@ -22,7 +22,7 @@ MODULE FTDIR_MOD CONTAINS FUNCTION PREPARE_FTDIR(ALLOCATOR,KF_FS) RESULT(HFTDIR) - USE PARKIND_ECTRANS, ONLY: JPIM, JPRBT + USE PARKIND_ECTRANS, ONLY: JPIM, JPRBT, JPIB USE TPM_DISTR, ONLY: D USE BUFFERED_ALLOCATOR_MOD, ONLY: BUFFERED_ALLOCATOR, RESERVE USE ISO_C_BINDING, ONLY: C_SIZE_T, C_SIZEOF diff --git a/src/trans/gpu/internal/ftinv_mod.F90 b/src/trans/gpu/internal/ftinv_mod.F90 index 21de8e95a..ddd07deb0 100755 --- a/src/trans/gpu/internal/ftinv_mod.F90 +++ b/src/trans/gpu/internal/ftinv_mod.F90 @@ -21,7 +21,7 @@ MODULE FTINV_MOD END TYPE CONTAINS FUNCTION PREPARE_FTINV(ALLOCATOR,KF_FS) RESULT(HFTINV) - USE PARKIND_ECTRANS, ONLY: JPIM, JPRBT + USE PARKIND_ECTRANS, ONLY: JPIM, JPRBT, JPIB USE TPM_DISTR, ONLY: D USE BUFFERED_ALLOCATOR_MOD, ONLY: BUFFERED_ALLOCATOR, RESERVE USE ISO_C_BINDING, ONLY: C_SIZE_T, C_SIZEOF diff --git a/src/trans/gpu/internal/trgtol_mod.F90 b/src/trans/gpu/internal/trgtol_mod.F90 index 794c1f99d..d2420a184 100755 --- a/src/trans/gpu/internal/trgtol_mod.F90 +++ b/src/trans/gpu/internal/trgtol_mod.F90 @@ -1,5 +1,6 @@ #define ALIGN(I, A) (((I)+(A)-1)/(A)*(A)) -! (C) Copyright 1995- ECMWF.,KMLOC,KM ! (C) Copyright 1995- Meteo-France. +! (C) Copyright 1995- ECMWF. +! (C) Copyright 1995- Meteo-France. ! (C) Copyright 2022- NVIDIA. ! ! This software is licensed under the terms of the Apache Licence Version 2.0 @@ -717,6 +718,7 @@ SUBROUTINE TRGTOL(ALLOCATOR,HTRGTOL,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G, #ifdef ACCGPU !$ACC END DATA #endif + ENDIF diff --git a/src/trans/gpu/internal/trltog_mod.F90 b/src/trans/gpu/internal/trltog_mod.F90 index ba17789b9..072ee0144 100755 --- a/src/trans/gpu/internal/trltog_mod.F90 +++ b/src/trans/gpu/internal/trltog_mod.F90 @@ -620,6 +620,7 @@ SUBROUTINE TRLTOG(ALLOCATOR,HTRLTOG,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G, #ifdef ACCGPU !$ACC END DATA #endif + ENDIF ! Figure out processes that send or recv something