From 08ec0aa208ed11e4d36d0a1205e8a6594f1dd9c3 Mon Sep 17 00:00:00 2001 From: jalvesz Date: Tue, 24 Dec 2024 13:12:50 +0100 Subject: [PATCH 01/18] intrinsics module with fast sums --- src/stdlib_intrinsics.fypp | 159 +++++++++++++++++++++++++++++++++++++ 1 file changed, 159 insertions(+) create mode 100644 src/stdlib_intrinsics.fypp diff --git a/src/stdlib_intrinsics.fypp b/src/stdlib_intrinsics.fypp new file mode 100644 index 000000000..086c58491 --- /dev/null +++ b/src/stdlib_intrinsics.fypp @@ -0,0 +1,159 @@ +#:include "common.fypp" +#:set R_KINDS_TYPES = list(zip(REAL_KINDS, REAL_TYPES, REAL_SUFFIX)) +#:set C_KINDS_TYPES = list(zip(CMPLX_KINDS, CMPLX_TYPES, CMPLX_SUFFIX)) +#:set RANKS = range(2, 2) +module stdlib_intrinsics + use stdlib_kinds + implicit none + private + + interface fsum + #:for rk, rt in REAL_KINDS_TYPES + module procedure :: fsum_1d_${rk}$ + module procedure :: fsum_1d_${rk}$_mask + #:endfor + end interface + public :: fsum + + interface fsum_kahan + #:for rk, rt in REAL_KINDS_TYPES + module procedure :: fsum_kahan_1d_${rk}$ + module procedure :: fsum_kahan_1d_${rk}$_mask + #:endfor + end interface + public :: fsum_kahan + + interface vkahan + #:for rk, rt in REAL_KINDS_TYPES + module procedure :: vkahan_${rk}$ + module procedure :: vkahan_m_${rk}$ + #:endfor + end interface + + #:for k1, t1, s1 in (R_KINDS_TYPES) + ${t1}$, parameter :: zero_${s1}$ = 0._${k1}$ + #:endfor + #:for k1, t1, s1 in (C_KINDS_TYPES) + ${t1}$, parameter :: zero_${s1}$ = (0._${k1}$,0._${k1}$) + #:endfor + integer, parameter :: chunk = 64 + +contains + +#:for k1, t1, s1 in R_KINDS_TYPES +pure function fsum_1d_${s1}$(a) result(s) + ${t1}$, intent(in) :: a(:) + ${t1}$ :: s + ${t1}$ :: abatch(chunk) + integer :: i, dr, rr + ! ----------------------------- + dr = size(a)/chunk + rr = size(a) - dr*chunk + + abatch = zero_${s1}$ + do i = 1, dr + abatch(1:chunk) = abatch(1:chunk) + a(chunk*i-chunk+1:chunk*i) + end do + abatch(1:rr) = abatch(1:rr) + a(size(a)-rr+1:size(a)) + + s = zero_${s1}$ + do i = 1, chunk/2 + s = s + abatch(i)+abatch(chunk/2+i) + end do +end function + +pure function fsum_1d_${s1}$_mask(a,mask) result(s) + ${t1}$, intent(in) :: a(:) + logical, intent(in) :: mask(:) + ${t1}$ :: s + ${t1}$ :: abatch(chunk) + integer :: i, dr, rr + ! ----------------------------- + dr = size(a)/chunk + rr = size(a) - dr*chunk + + abatch = zero_${s1}$ + do i = 1, dr + abatch(1:chunk) = abatch(1:chunk) + merge( zero_${s1}$ , a(chunk*i-chunk+1:chunk*i) , mask(chunk*i-chunk+1:chunk*i) ) + end do + abatch(1:rr) = abatch(1:rr) + merge( zero_${s1}$ , a(size(a)-rr+1:size(a)) , mask(size(a)-rr+1:size(a)) ) + + s = zero_${s1}$ + do i = 1, chunk/2 + s = s + abatch(i)+abatch(chunk/2+i) + end do +end function +#:endfor + +#:for k1, t1, s1 in R_KINDS_TYPES +pure function fsum_kahan_1d_${s1}$(a) result(s) + ${t1}$, intent(in) :: a(:) + ${t1}$ :: s + ${t1}$ :: sbatch(chunk) + ${t1}$ :: cbatch(chunk) + integer :: i, dr, rr + ! ----------------------------- + dr = size(a)/(chunk) + rr = size(a) - dr*chunk + sbatch = zero_${s1}$ + cbatch = zero_${s1}$ + do i = 1, dr + call vkahan( a(chunk*i-chunk+1:chunk*i) , sbatch(1:chunk) , cbatch(1:chunk) ) + end do + call vkahan( a(size(a)-rr+1:size(a)) , sbatch(1:rr) , cbatch(1:rr) ) + + s = zero_${s1}$ + do i = 1,chunk + call vkahan( sbatch(i) , s , cbatch(i) ) + end do +end function + +pure function fsum_kahan_1d_${s1}$_mask(a,mask) result(s) + ${t1}$, intent(in) :: a(:) + logical, intent(in) :: mask(:) + ${t1}$ :: s + ${t1}$ :: sbatch(chunk) + ${t1}$ :: cbatch(chunk) + integer :: i, dr, rr + ! ----------------------------- + dr = size(a)/(chunk) + rr = size(a) - dr*chunk + sbatch = zero_${s1}$ + cbatch = zero_${s1}$ + do i = 1, dr + call vkahan( a(chunk*i-chunk+1:chunk*i) , sbatch(1:chunk) , cbatch(1:chunk) , mask(chunk*i-chunk+1:chunk*i) ) + end do + call vkahan( a(size(a)-rr+1:size(a)) , sbatch(1:rr) , cbatch(1:rr) , mask(size(a)-rr+1:size(a)) ) + + s = zero_${s1}$ + do i = 1,chunk + call vkahan( sbatch(i) , s , cbatch(i) ) + end do +end function +#:endfor + +#:for k1, t1, s1 in R_KINDS_TYPES +elemental subroutine vkahan_${s1}$(a,s,c) + ${t1}$, intent(in) :: a + ${t1}$, intent(inout) :: s + ${t1}$, intent(inout) :: c + ${t1}$ :: t, y + y = a - c + t = s + y + c = (t - s) - y + s = t +end subroutine +elemental subroutine vkahan_m_${s1}$(a,s,c,m) + ${t1}$, intent(in) :: a + ${t1}$, intent(inout) :: s + ${t1}$, intent(inout) :: c + logical, intent(in) :: m + ${t1}$ :: t, y + y = a - c + t = s + y + c = (t - s) - y + s = merge( s , t , m ) +end subroutine +#:endfor + +end module stdlib_intrinsics From 2bc7af92e1923095be5909c64858d0f99a60755f Mon Sep 17 00:00:00 2001 From: jalvesz Date: Thu, 26 Dec 2024 21:04:10 +0100 Subject: [PATCH 02/18] add fast dot_product and start tests --- src/CMakeLists.txt | 1 + src/stdlib_intrinsics.fypp | 100 +++++++++++++++++++++++---- test/CMakeLists.txt | 1 + test/intrinsics/CMakeLists.txt | 7 ++ test/intrinsics/test_intrinsics.fypp | 94 +++++++++++++++++++++++++ 5 files changed, 190 insertions(+), 13 deletions(-) create mode 100644 test/intrinsics/CMakeLists.txt create mode 100644 test/intrinsics/test_intrinsics.fypp diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index ff9f39417..5f20d9f2a 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -16,6 +16,7 @@ set(fppFiles stdlib_hash_64bit_fnv.fypp stdlib_hash_64bit_pengy.fypp stdlib_hash_64bit_spookyv2.fypp + stdlib_intrinsics.fypp stdlib_io.fypp stdlib_io_npy.fypp stdlib_io_npy_load.fypp diff --git a/src/stdlib_intrinsics.fypp b/src/stdlib_intrinsics.fypp index 086c58491..dcbca0e70 100644 --- a/src/stdlib_intrinsics.fypp +++ b/src/stdlib_intrinsics.fypp @@ -1,32 +1,58 @@ #:include "common.fypp" #:set R_KINDS_TYPES = list(zip(REAL_KINDS, REAL_TYPES, REAL_SUFFIX)) #:set C_KINDS_TYPES = list(zip(CMPLX_KINDS, CMPLX_TYPES, CMPLX_SUFFIX)) -#:set RANKS = range(2, 2) +#:set RC_KINDS_TYPES = R_KINDS_TYPES + C_KINDS_TYPES + +#:def cnjg(type,expression) +#:if 'complex' in type +conjg(${expression}$) +#:else +${expression}$ +#:endif +#:enddef + +! This module is based on https://github.com/jalvesz/fast_math module stdlib_intrinsics + !!Replacement for certain Fortran intrinsic functions offering either faster and/or more accurate implementations. + !! ([Specification](../page/specs/stdlib_intrinsics.html)) use stdlib_kinds implicit none private interface fsum - #:for rk, rt in REAL_KINDS_TYPES - module procedure :: fsum_1d_${rk}$ - module procedure :: fsum_1d_${rk}$_mask + #:for rk, rt, rs in RC_KINDS_TYPES + module procedure :: fsum_1d_${rs}$ + module procedure :: fsum_1d_${rs}$_mask #:endfor end interface public :: fsum interface fsum_kahan - #:for rk, rt in REAL_KINDS_TYPES - module procedure :: fsum_kahan_1d_${rk}$ - module procedure :: fsum_kahan_1d_${rk}$_mask + #:for rk, rt, rs in RC_KINDS_TYPES + module procedure :: fsum_kahan_1d_${rs}$ + module procedure :: fsum_kahan_1d_${rs}$_mask #:endfor end interface public :: fsum_kahan + interface fprod + #:for rk, rt, rs in RC_KINDS_TYPES + module procedure :: fprod_${rs}$ + #:endfor + end interface + public :: fprod + + interface fprod_kahan + #:for rk, rt, rs in RC_KINDS_TYPES + module procedure :: fprod_kahan_${rs}$ + #:endfor + end interface + public :: fprod_kahan + interface vkahan - #:for rk, rt in REAL_KINDS_TYPES - module procedure :: vkahan_${rk}$ - module procedure :: vkahan_m_${rk}$ + #:for rk, rt, rs in RC_KINDS_TYPES + module procedure :: vkahan_${rs}$ + module procedure :: vkahan_m_${rs}$ #:endfor end interface @@ -40,7 +66,7 @@ module stdlib_intrinsics contains -#:for k1, t1, s1 in R_KINDS_TYPES +#:for k1, t1, s1 in RC_KINDS_TYPES pure function fsum_1d_${s1}$(a) result(s) ${t1}$, intent(in) :: a(:) ${t1}$ :: s @@ -85,7 +111,7 @@ pure function fsum_1d_${s1}$_mask(a,mask) result(s) end function #:endfor -#:for k1, t1, s1 in R_KINDS_TYPES +#:for k1, t1, s1 in RC_KINDS_TYPES pure function fsum_kahan_1d_${s1}$(a) result(s) ${t1}$, intent(in) :: a(:) ${t1}$ :: s @@ -132,7 +158,55 @@ pure function fsum_kahan_1d_${s1}$_mask(a,mask) result(s) end function #:endfor -#:for k1, t1, s1 in R_KINDS_TYPES +#:for k1, t1, s1 in RC_KINDS_TYPES +pure function fprod_${s1}$(a,b) result(p) + ${t1}$, intent(in) :: a(:) + ${t1}$, intent(in) :: b(:) + ${t1}$ :: p + ${t1}$ :: abatch(chunk) + integer :: i, dr, rr + ! ----------------------------- + dr = size(a)/chunk + rr = size(a) - dr*chunk + + abatch = zero_${s1}$ + do i = 1, dr + abatch(1:chunk) = abatch(1:chunk) + a(chunk*i-chunk+1:chunk*i)*${cnjg(t1,'b(chunk*i-chunk+1:chunk*i)')}$ + end do + abatch(1:rr) = abatch(1:rr) + a(size(a)-rr+1:size(a))*${cnjg(t1,'b(size(a)-rr+1:size(a))')}$ + + p = zero_${s1}$ + do i = 1, chunk/2 + p = p + abatch(i)+abatch(chunk/2+i) + end do +end function + +pure function fprod_kahan_${s1}$(a,b) result(p) + ${t1}$, intent(in) :: a(:) + ${t1}$, intent(in) :: b(:) + ${t1}$ :: p + ${t1}$ :: pbatch(chunk) + ${t1}$ :: cbatch(chunk) + integer :: i, dr, rr + ! ----------------------------- + dr = size(a)/(chunk) + rr = size(a) - dr*chunk + pbatch = zero_${s1}$ + cbatch = zero_${s1}$ + do i = 1, dr + call vkahan( a(chunk*i-chunk+1:chunk*i)*${cnjg(t1,'b(chunk*i-chunk+1:chunk*i)')}$ , pbatch(1:chunk) , cbatch(1:chunk) ) + end do + call vkahan( a(size(a)-rr+1:size(a))*${cnjg(t1,'b(size(a)-rr+1:size(a))')}$ , pbatch(1:rr) , cbatch(1:rr) ) + + p = zero_${s1}$ + do i = 1,chunk + call vkahan( pbatch(i) , p , cbatch(i) ) + end do +end function + +#:endfor + +#:for k1, t1, s1 in RC_KINDS_TYPES elemental subroutine vkahan_${s1}$(a,s,c) ${t1}$, intent(in) :: a ${t1}$, intent(inout) :: s diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 4d83548db..566c0e6d8 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -17,6 +17,7 @@ add_subdirectory(constants) add_subdirectory(hash_functions) add_subdirectory(hash_functions_perf) add_subdirectory(hashmaps) +add_subdirectory(intrinsics) add_subdirectory(io) add_subdirectory(linalg) add_subdirectory(logger) diff --git a/test/intrinsics/CMakeLists.txt b/test/intrinsics/CMakeLists.txt new file mode 100644 index 000000000..cc5e9fb87 --- /dev/null +++ b/test/intrinsics/CMakeLists.txt @@ -0,0 +1,7 @@ +set( + fppFiles + "test_intrinsics.fypp" +) +fypp_f90("${fyppFlags}" "${fppFiles}" outFiles) + +ADDTEST(intrinsics) diff --git a/test/intrinsics/test_intrinsics.fypp b/test/intrinsics/test_intrinsics.fypp new file mode 100644 index 000000000..d7f579b52 --- /dev/null +++ b/test/intrinsics/test_intrinsics.fypp @@ -0,0 +1,94 @@ +#:include "common.fypp" +#:set R_KINDS_TYPES = list(zip(REAL_KINDS, REAL_TYPES, REAL_SUFFIX)) +#:set C_KINDS_TYPES = list(zip(CMPLX_KINDS, CMPLX_TYPES, CMPLX_SUFFIX)) +#:set RC_KINDS_TYPES = R_KINDS_TYPES + C_KINDS_TYPES + +module test_intrinsics + use testdrive, only : new_unittest, unittest_type, error_type, check, skip_test + use stdlib_kinds, only: sp, dp, xdp, qp, int8, int16, int32, int64 + use stdlib_intrinsics + use stdlib_math, only: swap + implicit none + +contains + +!> Collect all exported unit tests +subroutine collect_suite(testsuite) + !> Collection of tests + type(unittest_type), allocatable, intent(out) :: testsuite(:) + + testsuite = [ & + new_unittest('sum', test_sum) & + !new_unittest('dot_product', test_dot_product) & + ] +end subroutine + +subroutine test_sum(error) + !> Error handling + type(error_type), allocatable, intent(out) :: error + + !> Internal parameters and variables + integer, parameter :: n = 1e3, ncalc = 3, niter = 100 + real(sp) :: u + integer :: iter, i, j + !==================================================================================== + #:for k1, t1, s1 in R_KINDS_TYPES + block + ${t1}$, allocatable :: x(:) + ${t1}$, parameter :: total_sum = 4*atan(1._${k1}$), tolerance = epsilon(1._${k1}$)*100 + ${t1}$ :: xsum(ncalc), meanval(ncalc), err(ncalc) + + allocate(x(n)) + do i = 1, n + x(i) = 8*atan(1._${k1}$)*(real(i,kind=${k1}$)-0.5_${k1}$)/real(n,kind=${k1}$)**2 + end do + ! scramble array + do i = 1, n + call random_number(u) + j = 1 + FLOOR(n*u) + call swap( x(i), x(j) ) + end do + + err(:) = 0._${k1}$ + do iter = 1, niter + xsum(1) = sum(x) + xsum(2) = fsum_kahan(x) + xsum(3) = fsum(x) + err(1:ncalc) = err(1:ncalc) + abs(1._${k1}$-xsum(1:ncalc)/total_sum) + end do + err(1:ncalc) = err(1:ncalc) / niter + print *, err + call check(error, all(err(:) 0) then + write(error_unit, '(i0, 1x, a)') stat, "test(s) failed!" + error stop + end if +end program \ No newline at end of file From 243ea6fd68f3a643d7c6cd14f0008e87fd370242 Mon Sep 17 00:00:00 2001 From: jalvesz Date: Thu, 26 Dec 2024 23:20:38 +0100 Subject: [PATCH 03/18] add complex sum test --- test/intrinsics/test_intrinsics.fypp | 46 ++++++++++++++++++++++++---- 1 file changed, 40 insertions(+), 6 deletions(-) diff --git a/test/intrinsics/test_intrinsics.fypp b/test/intrinsics/test_intrinsics.fypp index d7f579b52..accc3a822 100644 --- a/test/intrinsics/test_intrinsics.fypp +++ b/test/intrinsics/test_intrinsics.fypp @@ -45,20 +45,54 @@ subroutine test_sum(error) ! scramble array do i = 1, n call random_number(u) - j = 1 + FLOOR(n*u) + j = 1 + floor(n*u) call swap( x(i), x(j) ) end do err(:) = 0._${k1}$ do iter = 1, niter - xsum(1) = sum(x) - xsum(2) = fsum_kahan(x) - xsum(3) = fsum(x) + xsum(1) = sum(x) ! compiler intrinsic + xsum(2) = fsum_kahan(x) ! chunked Kahan summation + xsum(3) = fsum(x) ! chunked summation err(1:ncalc) = err(1:ncalc) + abs(1._${k1}$-xsum(1:ncalc)/total_sum) end do err(1:ncalc) = err(1:ncalc) / niter - print *, err - call check(error, all(err(:) Date: Thu, 26 Dec 2024 23:36:05 +0100 Subject: [PATCH 04/18] test masked sum --- test/intrinsics/test_intrinsics.fypp | 34 ++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/test/intrinsics/test_intrinsics.fypp b/test/intrinsics/test_intrinsics.fypp index accc3a822..790250a68 100644 --- a/test/intrinsics/test_intrinsics.fypp +++ b/test/intrinsics/test_intrinsics.fypp @@ -37,16 +37,21 @@ subroutine test_sum(error) ${t1}$, allocatable :: x(:) ${t1}$, parameter :: total_sum = 4*atan(1._${k1}$), tolerance = epsilon(1._${k1}$)*100 ${t1}$ :: xsum(ncalc), meanval(ncalc), err(ncalc) + logical, allocatable :: mask(:), nmask(:) allocate(x(n)) do i = 1, n x(i) = 8*atan(1._${k1}$)*(real(i,kind=${k1}$)-0.5_${k1}$)/real(n,kind=${k1}$)**2 end do + allocate(mask(n),source=.false.); mask(1:n:2) = .true. + allocate(nmask(n)); nmask = .not.mask ! scramble array do i = 1, n call random_number(u) j = 1 + floor(n*u) call swap( x(i), x(j) ) + call swap( mask(i), mask(j) ) + call swap( nmask(i), nmask(j) ) end do err(:) = 0._${k1}$ @@ -60,6 +65,18 @@ subroutine test_sum(error) call check(error, all(err(:) Date: Thu, 26 Dec 2024 23:49:06 +0100 Subject: [PATCH 05/18] add dot_product tests --- test/intrinsics/test_intrinsics.fypp | 80 +++++++++++++++++++++++++++- 1 file changed, 78 insertions(+), 2 deletions(-) diff --git a/test/intrinsics/test_intrinsics.fypp b/test/intrinsics/test_intrinsics.fypp index 790250a68..f83f60759 100644 --- a/test/intrinsics/test_intrinsics.fypp +++ b/test/intrinsics/test_intrinsics.fypp @@ -18,8 +18,8 @@ subroutine collect_suite(testsuite) type(unittest_type), allocatable, intent(out) :: testsuite(:) testsuite = [ & - new_unittest('sum', test_sum) & - !new_unittest('dot_product', test_dot_product) & + new_unittest('sum', test_sum), & + new_unittest('dot_product', test_dot_product) & ] end subroutine @@ -132,6 +132,82 @@ subroutine test_sum(error) #:endfor end subroutine + +subroutine test_dot_product(error) + !> Error handling + type(error_type), allocatable, intent(out) :: error + + !> Internal parameters and variables + integer, parameter :: n = 1e3, ncalc = 3, niter = 100 + real(sp) :: u + integer :: iter, i, j + !==================================================================================== + #:for k1, t1, s1 in R_KINDS_TYPES + block + ${t1}$, allocatable :: x(:) + ${t1}$, parameter :: total_sum = 4*atan(1._${k1}$), tolerance = epsilon(1._${k1}$)*100 + ${t1}$ :: xsum(ncalc), meanval(ncalc), err(ncalc) + + allocate(x(n)) + do i = 1, n + x(i) = sqrt( 8*atan(1._${k1}$)*(real(i,kind=${k1}$)-0.5_${k1}$)/real(n,kind=${k1}$)**2 ) + end do + ! scramble array + do i = 1, n + call random_number(u) + j = 1 + floor(n*u) + call swap( x(i), x(j) ) + end do + + err(:) = 0._${k1}$ + do iter = 1, niter + xsum(1) = dot_product(x,x) ! compiler intrinsic + xsum(2) = fprod_kahan(x,x) ! chunked Kahan summation + xsum(3) = fprod(x,x) ! chunked summation + err(1:ncalc) = err(1:ncalc) + abs(1._${k1}$-xsum(1:ncalc)/total_sum) + end do + err(1:ncalc) = err(1:ncalc) / niter + + call check(error, all(err(:) Date: Sun, 29 Dec 2024 22:13:47 +0100 Subject: [PATCH 06/18] start specs --- doc/specs/stdlib_intrinsics.md | 45 ++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) create mode 100644 doc/specs/stdlib_intrinsics.md diff --git a/doc/specs/stdlib_intrinsics.md b/doc/specs/stdlib_intrinsics.md new file mode 100644 index 000000000..956d47ffc --- /dev/null +++ b/doc/specs/stdlib_intrinsics.md @@ -0,0 +1,45 @@ +--- +title: intrinsics +--- + +# The `stdlib_intrinsics` module + +[TOC] + +## Introduction + +The `stdlib_intrinsics` module provides replacements for some of the well known intrinsic functions found in Fortran compilers for which either a faster and/or more accurate implementation is found which has also proven of interest to the Fortran community. + + +### `fsum` function + +#### Description + +The `fsum` function can replace the intrinsic `sum` for 1D `real` or `complex` arrays. It follows a chunked implementation which maximaxes vectorization potential as well as reducing the round-off error. + +#### Syntax + +`res = ` [[stdlib_intrinsics(module):fsum(interface)]] ` (x [,mask] )` + +#### Status + +Experimental + +#### Class + +Pure function. + +#### Argument(s) + +`x`: 1D array of either `real` or `complex` type. This argument is `intent(in)`. +`mask`: 1D array of `logical` values. This argument is `intent(in)`. + +#### Output value or Result value + +The output is a scalar of `type` and `kind` same as to that of `x`. + +#### Example + +```fortran +{!example/math/example_intrinsics_sum.f90!} +``` From 75945f1234f1cba1d4a89ee5b870ad056bb536fc Mon Sep 17 00:00:00 2001 From: jalvesz Date: Thu, 2 Jan 2025 22:15:14 +0100 Subject: [PATCH 07/18] split into submodules --- src/CMakeLists.txt | 2 + src/stdlib_constants.fypp | 11 ++ src/stdlib_intrinsics.fypp | 213 +++++-------------------- src/stdlib_intrinsics_dot_product.fypp | 75 +++++++++ src/stdlib_intrinsics_sum.fypp | 118 ++++++++++++++ 5 files changed, 246 insertions(+), 173 deletions(-) create mode 100644 src/stdlib_intrinsics_dot_product.fypp create mode 100644 src/stdlib_intrinsics_sum.fypp diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 9b52321f5..350db423a 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -16,6 +16,8 @@ set(fppFiles stdlib_hash_64bit_fnv.fypp stdlib_hash_64bit_pengy.fypp stdlib_hash_64bit_spookyv2.fypp + stdlib_intrinsics_dot_product.fypp + stdlib_intrinsics_sum.fypp stdlib_intrinsics.fypp stdlib_io.fypp stdlib_io_npy.fypp diff --git a/src/stdlib_constants.fypp b/src/stdlib_constants.fypp index e169dbb3b..61c7d6c90 100644 --- a/src/stdlib_constants.fypp +++ b/src/stdlib_constants.fypp @@ -1,5 +1,8 @@ #:include "common.fypp" #:set KINDS = REAL_KINDS +#:set R_KINDS_TYPES = list(zip(REAL_KINDS, REAL_TYPES, REAL_SUFFIX)) +#:set C_KINDS_TYPES = list(zip(CMPLX_KINDS, CMPLX_TYPES, CMPLX_SUFFIX)) + module stdlib_constants !! Constants !! ([Specification](../page/specs/stdlib_constants.html)) @@ -60,5 +63,13 @@ module stdlib_constants real(dp), parameter, public :: u = ATOMIC_MASS_CONSTANT%value !! Atomic mass constant ! Additional constants if needed + #:for k, t, s in R_KINDS_TYPES + ${t}$, parameter, public :: zero_${s}$ = 0._${k}$ + ${t}$, parameter, public :: one_${s}$ = 1._${k}$ + #:endfor + #:for k, t, s in C_KINDS_TYPES + ${t}$, parameter, public :: zero_${s}$ = (0._${k}$,0._${k}$) + ${t}$, parameter, public :: one_${s}$ = (1._${k}$,0._${k}$) + #:endfor end module stdlib_constants diff --git a/src/stdlib_intrinsics.fypp b/src/stdlib_intrinsics.fypp index dcbca0e70..0a4d3244a 100644 --- a/src/stdlib_intrinsics.fypp +++ b/src/stdlib_intrinsics.fypp @@ -3,14 +3,6 @@ #:set C_KINDS_TYPES = list(zip(CMPLX_KINDS, CMPLX_TYPES, CMPLX_SUFFIX)) #:set RC_KINDS_TYPES = R_KINDS_TYPES + C_KINDS_TYPES -#:def cnjg(type,expression) -#:if 'complex' in type -conjg(${expression}$) -#:else -${expression}$ -#:endif -#:enddef - ! This module is based on https://github.com/jalvesz/fast_math module stdlib_intrinsics !!Replacement for certain Fortran intrinsic functions offering either faster and/or more accurate implementations. @@ -21,30 +13,52 @@ module stdlib_intrinsics interface fsum #:for rk, rt, rs in RC_KINDS_TYPES - module procedure :: fsum_1d_${rs}$ - module procedure :: fsum_1d_${rs}$_mask + pure module function fsum_1d_${rs}$(a) result(s) + ${rt}$, intent(in) :: a(:) + ${rt}$ :: s + end function + pure module function fsum_1d_${rs}$_mask(a,mask) result(s) + ${rt}$, intent(in) :: a(:) + logical, intent(in) :: mask(:) + ${rt}$ :: s + end function #:endfor end interface public :: fsum interface fsum_kahan #:for rk, rt, rs in RC_KINDS_TYPES - module procedure :: fsum_kahan_1d_${rs}$ - module procedure :: fsum_kahan_1d_${rs}$_mask + pure module function fsum_kahan_1d_${rs}$(a) result(s) + ${rt}$, intent(in) :: a(:) + ${rt}$ :: s + end function + pure module function fsum_kahan_1d_${rs}$_mask(a,mask) result(s) + ${rt}$, intent(in) :: a(:) + logical, intent(in) :: mask(:) + ${rt}$ :: s + end function #:endfor end interface public :: fsum_kahan interface fprod #:for rk, rt, rs in RC_KINDS_TYPES - module procedure :: fprod_${rs}$ + pure module function fprod_${rs}$(a,b) result(p) + ${rt}$, intent(in) :: a(:) + ${rt}$, intent(in) :: b(:) + ${rt}$ :: p + end function #:endfor end interface public :: fprod interface fprod_kahan #:for rk, rt, rs in RC_KINDS_TYPES - module procedure :: fprod_kahan_${rs}$ + pure module function fprod_kahan_${rs}$(a,b) result(p) + ${rt}$, intent(in) :: a(:) + ${rt}$, intent(in) :: b(:) + ${rt}$ :: p + end function #:endfor end interface public :: fprod_kahan @@ -55,174 +69,27 @@ module stdlib_intrinsics module procedure :: vkahan_m_${rs}$ #:endfor end interface - - #:for k1, t1, s1 in (R_KINDS_TYPES) - ${t1}$, parameter :: zero_${s1}$ = 0._${k1}$ - #:endfor - #:for k1, t1, s1 in (C_KINDS_TYPES) - ${t1}$, parameter :: zero_${s1}$ = (0._${k1}$,0._${k1}$) - #:endfor - integer, parameter :: chunk = 64 + public :: vkahan contains -#:for k1, t1, s1 in RC_KINDS_TYPES -pure function fsum_1d_${s1}$(a) result(s) - ${t1}$, intent(in) :: a(:) - ${t1}$ :: s - ${t1}$ :: abatch(chunk) - integer :: i, dr, rr - ! ----------------------------- - dr = size(a)/chunk - rr = size(a) - dr*chunk - - abatch = zero_${s1}$ - do i = 1, dr - abatch(1:chunk) = abatch(1:chunk) + a(chunk*i-chunk+1:chunk*i) - end do - abatch(1:rr) = abatch(1:rr) + a(size(a)-rr+1:size(a)) - - s = zero_${s1}$ - do i = 1, chunk/2 - s = s + abatch(i)+abatch(chunk/2+i) - end do -end function - -pure function fsum_1d_${s1}$_mask(a,mask) result(s) - ${t1}$, intent(in) :: a(:) - logical, intent(in) :: mask(:) - ${t1}$ :: s - ${t1}$ :: abatch(chunk) - integer :: i, dr, rr - ! ----------------------------- - dr = size(a)/chunk - rr = size(a) - dr*chunk - - abatch = zero_${s1}$ - do i = 1, dr - abatch(1:chunk) = abatch(1:chunk) + merge( zero_${s1}$ , a(chunk*i-chunk+1:chunk*i) , mask(chunk*i-chunk+1:chunk*i) ) - end do - abatch(1:rr) = abatch(1:rr) + merge( zero_${s1}$ , a(size(a)-rr+1:size(a)) , mask(size(a)-rr+1:size(a)) ) - - s = zero_${s1}$ - do i = 1, chunk/2 - s = s + abatch(i)+abatch(chunk/2+i) - end do -end function -#:endfor - -#:for k1, t1, s1 in RC_KINDS_TYPES -pure function fsum_kahan_1d_${s1}$(a) result(s) - ${t1}$, intent(in) :: a(:) - ${t1}$ :: s - ${t1}$ :: sbatch(chunk) - ${t1}$ :: cbatch(chunk) - integer :: i, dr, rr - ! ----------------------------- - dr = size(a)/(chunk) - rr = size(a) - dr*chunk - sbatch = zero_${s1}$ - cbatch = zero_${s1}$ - do i = 1, dr - call vkahan( a(chunk*i-chunk+1:chunk*i) , sbatch(1:chunk) , cbatch(1:chunk) ) - end do - call vkahan( a(size(a)-rr+1:size(a)) , sbatch(1:rr) , cbatch(1:rr) ) - - s = zero_${s1}$ - do i = 1,chunk - call vkahan( sbatch(i) , s , cbatch(i) ) - end do -end function - -pure function fsum_kahan_1d_${s1}$_mask(a,mask) result(s) - ${t1}$, intent(in) :: a(:) - logical, intent(in) :: mask(:) - ${t1}$ :: s - ${t1}$ :: sbatch(chunk) - ${t1}$ :: cbatch(chunk) - integer :: i, dr, rr - ! ----------------------------- - dr = size(a)/(chunk) - rr = size(a) - dr*chunk - sbatch = zero_${s1}$ - cbatch = zero_${s1}$ - do i = 1, dr - call vkahan( a(chunk*i-chunk+1:chunk*i) , sbatch(1:chunk) , cbatch(1:chunk) , mask(chunk*i-chunk+1:chunk*i) ) - end do - call vkahan( a(size(a)-rr+1:size(a)) , sbatch(1:rr) , cbatch(1:rr) , mask(size(a)-rr+1:size(a)) ) - - s = zero_${s1}$ - do i = 1,chunk - call vkahan( sbatch(i) , s , cbatch(i) ) - end do -end function -#:endfor - -#:for k1, t1, s1 in RC_KINDS_TYPES -pure function fprod_${s1}$(a,b) result(p) - ${t1}$, intent(in) :: a(:) - ${t1}$, intent(in) :: b(:) - ${t1}$ :: p - ${t1}$ :: abatch(chunk) - integer :: i, dr, rr - ! ----------------------------- - dr = size(a)/chunk - rr = size(a) - dr*chunk - - abatch = zero_${s1}$ - do i = 1, dr - abatch(1:chunk) = abatch(1:chunk) + a(chunk*i-chunk+1:chunk*i)*${cnjg(t1,'b(chunk*i-chunk+1:chunk*i)')}$ - end do - abatch(1:rr) = abatch(1:rr) + a(size(a)-rr+1:size(a))*${cnjg(t1,'b(size(a)-rr+1:size(a))')}$ - - p = zero_${s1}$ - do i = 1, chunk/2 - p = p + abatch(i)+abatch(chunk/2+i) - end do -end function - -pure function fprod_kahan_${s1}$(a,b) result(p) - ${t1}$, intent(in) :: a(:) - ${t1}$, intent(in) :: b(:) - ${t1}$ :: p - ${t1}$ :: pbatch(chunk) - ${t1}$ :: cbatch(chunk) - integer :: i, dr, rr - ! ----------------------------- - dr = size(a)/(chunk) - rr = size(a) - dr*chunk - pbatch = zero_${s1}$ - cbatch = zero_${s1}$ - do i = 1, dr - call vkahan( a(chunk*i-chunk+1:chunk*i)*${cnjg(t1,'b(chunk*i-chunk+1:chunk*i)')}$ , pbatch(1:chunk) , cbatch(1:chunk) ) - end do - call vkahan( a(size(a)-rr+1:size(a))*${cnjg(t1,'b(size(a)-rr+1:size(a))')}$ , pbatch(1:rr) , cbatch(1:rr) ) - - p = zero_${s1}$ - do i = 1,chunk - call vkahan( pbatch(i) , p , cbatch(i) ) - end do -end function - -#:endfor - -#:for k1, t1, s1 in RC_KINDS_TYPES -elemental subroutine vkahan_${s1}$(a,s,c) - ${t1}$, intent(in) :: a - ${t1}$, intent(inout) :: s - ${t1}$, intent(inout) :: c - ${t1}$ :: t, y +#:for rk, rt, rs in RC_KINDS_TYPES +elemental subroutine vkahan_${rs}$(a,s,c) + ${rt}$, intent(in) :: a + ${rt}$, intent(inout) :: s + ${rt}$, intent(inout) :: c + ${rt}$ :: t, y y = a - c t = s + y c = (t - s) - y s = t end subroutine -elemental subroutine vkahan_m_${s1}$(a,s,c,m) - ${t1}$, intent(in) :: a - ${t1}$, intent(inout) :: s - ${t1}$, intent(inout) :: c +elemental subroutine vkahan_m_${rs}$(a,s,c,m) + ${rt}$, intent(in) :: a + ${rt}$, intent(inout) :: s + ${rt}$, intent(inout) :: c logical, intent(in) :: m - ${t1}$ :: t, y + ${rt}$ :: t, y y = a - c t = s + y c = (t - s) - y diff --git a/src/stdlib_intrinsics_dot_product.fypp b/src/stdlib_intrinsics_dot_product.fypp new file mode 100644 index 000000000..045d512b2 --- /dev/null +++ b/src/stdlib_intrinsics_dot_product.fypp @@ -0,0 +1,75 @@ +#:include "common.fypp" +#:set R_KINDS_TYPES = list(zip(REAL_KINDS, REAL_TYPES, REAL_SUFFIX)) +#:set C_KINDS_TYPES = list(zip(CMPLX_KINDS, CMPLX_TYPES, CMPLX_SUFFIX)) +#:set RC_KINDS_TYPES = R_KINDS_TYPES + C_KINDS_TYPES + +#:def cnjg(type,expression) +#:if 'complex' in type +conjg(${expression}$) +#:else +${expression}$ +#:endif +#:enddef + +! This module is based on https://github.com/jalvesz/fast_math +submodule(stdlib_intrinsics) stdlib_intrinsics_dot_product + !!Replacement for certain Fortran intrinsic functions offering either faster and/or more accurate implementations. + !! ([Specification](../page/specs/stdlib_intrinsics.html)) + use stdlib_kinds + use stdlib_constants + implicit none + + integer, parameter :: chunk = 64 + +contains + +#:for k1, t1, s1 in RC_KINDS_TYPES +pure module function fprod_${s1}$(a,b) result(p) + ${t1}$, intent(in) :: a(:) + ${t1}$, intent(in) :: b(:) + ${t1}$ :: p + ${t1}$ :: abatch(chunk) + integer :: i, dr, rr + ! ----------------------------- + dr = size(a)/chunk + rr = size(a) - dr*chunk + + abatch = zero_${s1}$ + do i = 1, dr + abatch(1:chunk) = abatch(1:chunk) + a(chunk*i-chunk+1:chunk*i)*${cnjg(t1,'b(chunk*i-chunk+1:chunk*i)')}$ + end do + abatch(1:rr) = abatch(1:rr) + a(size(a)-rr+1:size(a))*${cnjg(t1,'b(size(a)-rr+1:size(a))')}$ + + p = zero_${s1}$ + do i = 1, chunk/2 + p = p + abatch(i)+abatch(chunk/2+i) + end do +end function +#:endfor + +#:for k1, t1, s1 in RC_KINDS_TYPES +pure module function fprod_kahan_${s1}$(a,b) result(p) + ${t1}$, intent(in) :: a(:) + ${t1}$, intent(in) :: b(:) + ${t1}$ :: p + ${t1}$ :: pbatch(chunk) + ${t1}$ :: cbatch(chunk) + integer :: i, dr, rr + ! ----------------------------- + dr = size(a)/(chunk) + rr = size(a) - dr*chunk + pbatch = zero_${s1}$ + cbatch = zero_${s1}$ + do i = 1, dr + call vkahan( a(chunk*i-chunk+1:chunk*i)*${cnjg(t1,'b(chunk*i-chunk+1:chunk*i)')}$ , pbatch(1:chunk) , cbatch(1:chunk) ) + end do + call vkahan( a(size(a)-rr+1:size(a))*${cnjg(t1,'b(size(a)-rr+1:size(a))')}$ , pbatch(1:rr) , cbatch(1:rr) ) + + p = zero_${s1}$ + do i = 1,chunk + call vkahan( pbatch(i) , p , cbatch(i) ) + end do +end function +#:endfor + +end submodule stdlib_intrinsics_dot_product diff --git a/src/stdlib_intrinsics_sum.fypp b/src/stdlib_intrinsics_sum.fypp new file mode 100644 index 000000000..e189ca897 --- /dev/null +++ b/src/stdlib_intrinsics_sum.fypp @@ -0,0 +1,118 @@ +#:include "common.fypp" +#:set R_KINDS_TYPES = list(zip(REAL_KINDS, REAL_TYPES, REAL_SUFFIX)) +#:set C_KINDS_TYPES = list(zip(CMPLX_KINDS, CMPLX_TYPES, CMPLX_SUFFIX)) +#:set RC_KINDS_TYPES = R_KINDS_TYPES + C_KINDS_TYPES + +#:def cnjg(type,expression) +#:if 'complex' in type +conjg(${expression}$) +#:else +${expression}$ +#:endif +#:enddef + +! This module is based on https://github.com/jalvesz/fast_math +submodule(stdlib_intrinsics) stdlib_intrinsics_sum + !!Replacement for certain Fortran intrinsic functions offering either faster and/or more accurate implementations. + !! ([Specification](../page/specs/stdlib_intrinsics.html)) + use stdlib_kinds + use stdlib_constants + implicit none + + integer, parameter :: chunk = 64 + +contains + +#:for rk, rt, rs in RC_KINDS_TYPES +pure module function fsum_1d_${rs}$(a) result(s) + ${rt}$, intent(in) :: a(:) + ${rt}$ :: s + ${rt}$ :: abatch(chunk) + integer :: i, dr, rr + ! ----------------------------- + dr = size(a)/chunk + rr = size(a) - dr*chunk + + abatch = zero_${rs}$ + do i = 1, dr + abatch(1:chunk) = abatch(1:chunk) + a(chunk*i-chunk+1:chunk*i) + end do + abatch(1:rr) = abatch(1:rr) + a(size(a)-rr+1:size(a)) + + s = zero_${rs}$ + do i = 1, chunk/2 + s = s + abatch(i)+abatch(chunk/2+i) + end do +end function + +pure module function fsum_1d_${rs}$_mask(a,mask) result(s) + ${rt}$, intent(in) :: a(:) + logical, intent(in) :: mask(:) + ${rt}$ :: s + ${rt}$ :: abatch(chunk) + integer :: i, dr, rr + ! ----------------------------- + dr = size(a)/chunk + rr = size(a) - dr*chunk + + abatch = zero_${rs}$ + do i = 1, dr + abatch(1:chunk) = abatch(1:chunk) + merge( zero_${rs}$ , a(chunk*i-chunk+1:chunk*i) , mask(chunk*i-chunk+1:chunk*i) ) + end do + abatch(1:rr) = abatch(1:rr) + merge( zero_${rs}$ , a(size(a)-rr+1:size(a)) , mask(size(a)-rr+1:size(a)) ) + + s = zero_${rs}$ + do i = 1, chunk/2 + s = s + abatch(i)+abatch(chunk/2+i) + end do +end function +#:endfor + +#:for rk, rt, rs in RC_KINDS_TYPES +pure module function fsum_kahan_1d_${rs}$(a) result(s) + ${rt}$, intent(in) :: a(:) + ${rt}$ :: s + ${rt}$ :: sbatch(chunk) + ${rt}$ :: cbatch(chunk) + integer :: i, dr, rr + ! ----------------------------- + dr = size(a)/(chunk) + rr = size(a) - dr*chunk + sbatch = zero_${rs}$ + cbatch = zero_${rs}$ + do i = 1, dr + call vkahan( a(chunk*i-chunk+1:chunk*i) , sbatch(1:chunk) , cbatch(1:chunk) ) + end do + call vkahan( a(size(a)-rr+1:size(a)) , sbatch(1:rr) , cbatch(1:rr) ) + + s = zero_${rs}$ + do i = 1,chunk + call vkahan( sbatch(i) , s , cbatch(i) ) + end do +end function + +pure module function fsum_kahan_1d_${rs}$_mask(a,mask) result(s) + ${rt}$, intent(in) :: a(:) + logical, intent(in) :: mask(:) + ${rt}$ :: s + ${rt}$ :: sbatch(chunk) + ${rt}$ :: cbatch(chunk) + integer :: i, dr, rr + ! ----------------------------- + dr = size(a)/(chunk) + rr = size(a) - dr*chunk + sbatch = zero_${rs}$ + cbatch = zero_${rs}$ + do i = 1, dr + call vkahan( a(chunk*i-chunk+1:chunk*i) , sbatch(1:chunk) , cbatch(1:chunk) , mask(chunk*i-chunk+1:chunk*i) ) + end do + call vkahan( a(size(a)-rr+1:size(a)) , sbatch(1:rr) , cbatch(1:rr) , mask(size(a)-rr+1:size(a)) ) + + s = zero_${rs}$ + do i = 1,chunk + call vkahan( sbatch(i) , s , cbatch(i) ) + end do +end function +#:endfor + +end submodule stdlib_intrinsics_sum From d05903f0a7b6a2d010175c3bafe462f926291d12 Mon Sep 17 00:00:00 2001 From: jalvesz Date: Thu, 2 Jan 2025 22:58:15 +0100 Subject: [PATCH 08/18] specs and examples --- doc/specs/stdlib_intrinsics.md | 117 ++++++++++++++++++++- example/CMakeLists.txt | 1 + example/intrinsics/CMakeLists.txt | 2 + example/intrinsics/example_dot_product.f90 | 18 ++++ example/intrinsics/example_sum.f90 | 17 +++ 5 files changed, 153 insertions(+), 2 deletions(-) create mode 100644 example/intrinsics/CMakeLists.txt create mode 100644 example/intrinsics/example_dot_product.f90 create mode 100644 example/intrinsics/example_sum.f90 diff --git a/doc/specs/stdlib_intrinsics.md b/doc/specs/stdlib_intrinsics.md index 956d47ffc..099d6ccb4 100644 --- a/doc/specs/stdlib_intrinsics.md +++ b/doc/specs/stdlib_intrinsics.md @@ -15,7 +15,7 @@ The `stdlib_intrinsics` module provides replacements for some of the well known #### Description -The `fsum` function can replace the intrinsic `sum` for 1D `real` or `complex` arrays. It follows a chunked implementation which maximaxes vectorization potential as well as reducing the round-off error. +The `fsum` function can replace the intrinsic `sum` for 1D `real` or `complex` arrays. It follows a chunked implementation which maximizes vectorization potential as well as reducing the round-off error. This procedure is recommended when summing large arrays, for repetitive summation of smaller arrays consider the classical `sum`. #### Syntax @@ -32,7 +32,7 @@ Pure function. #### Argument(s) `x`: 1D array of either `real` or `complex` type. This argument is `intent(in)`. -`mask`: 1D array of `logical` values. This argument is `intent(in)`. +`mask` (optional): 1D array of `logical` values. This argument is `intent(in)`. #### Output value or Result value @@ -43,3 +43,116 @@ The output is a scalar of `type` and `kind` same as to that of `x`. ```fortran {!example/math/example_intrinsics_sum.f90!} ``` + + +### `fsum_kahan` function + +#### Description + +The `fsum_kahan` function can replace the intrinsic `sum` for 1D `real` or `complex` arrays. It follows a chunked implementation which maximizes vectorization potential, complemented by an `elemental` kernel based on the [kahan summation](https://en.wikipedia.org/wiki/Kahan_summation_algorithm) strategy to reduce the round-off error: + +```fortran +elemental subroutine vkahan_(a,s,c) + type(), intent(in) :: a + type(), intent(inout) :: s + type(), intent(inout) :: c + type() :: t, y + y = a - c + t = s + y + c = (t - s) - y + s = t +end subroutine +``` + +#### Syntax + +`res = ` [[stdlib_intrinsics(module):fsum_kahan(interface)]] ` (x [,mask] )` + +#### Status + +Experimental + +#### Class + +Pure function. + +#### Argument(s) + +`x`: 1D array of either `real` or `complex` type. This argument is `intent(in)`. +`mask` (optional): 1D array of `logical` values. This argument is `intent(in)`. + +#### Output value or Result value + +The output is a scalar of `type` and `kind` same as to that of `x`. + +#### Example + +```fortran +{!example/math/example_intrinsics_sum.f90!} +``` + + +### `fprod` function + +#### Description + +The `fprod` function can replace the intrinsic `dot_product` for 1D `real` or `complex` arrays. It follows a chunked implementation which maximizes vectorization potential as well as reducing the round-off error. This procedure is recommended when crunching large arrays, for repetitive products of smaller arrays consider the classical `dot_product`. + +#### Syntax + +`res = ` [[stdlib_intrinsics(module):fprod(interface)]] ` (x, y)` + +#### Status + +Experimental + +#### Class + +Pure function. + +#### Argument(s) + +`x`: 1D array of either `real` or `complex` type. This argument is `intent(in)`. +`y`: 1D array of the same type and kind as `x`. This argument is `intent(in)`. + +#### Output value or Result value + +The output is a scalar of `type` and `kind` same as to that of `x` and `y`. + +#### Example + +```fortran +{!example/math/example_intrinsics_dot_duct.f90!} +``` + + +### `fprod_kahan` function + +#### Description + +The `fprod_kahan` function can replace the intrinsic `dot_product` for 1D `real` or `complex` arrays. It follows a chunked implementation which maximizes vectorization potential , complemented by the same `elemental` kernel based on the [kahan summation](https://en.wikipedia.org/wiki/Kahan_summation_algorithm) used for `fsum` to reduce the round-off error. + +#### Syntax + +`res = ` [[stdlib_intrinsics(module):fprod_kahan(interface)]] ` (x, y)` + +#### Status + +Experimental + +#### Class + +Pure function. + +#### Argument(s) + +`x`: 1D array of either `real` or `complex` type. This argument is `intent(in)`. +`y`: 1D array of the same type and kind as `x`. This argument is `intent(in)`. + +#### Output value or Result value + +The output is a scalar of `type` and `kind` same as to that of `x` and `y`. + +```fortran +{!example/math/example_intrinsics_dot_duct.f90!} +``` \ No newline at end of file diff --git a/example/CMakeLists.txt b/example/CMakeLists.txt index cbef7f075..968a048b0 100644 --- a/example/CMakeLists.txt +++ b/example/CMakeLists.txt @@ -13,6 +13,7 @@ add_subdirectory(constants) add_subdirectory(error) add_subdirectory(hashmaps) add_subdirectory(hash_procedures) +add_subdirectory(intrinsics) add_subdirectory(io) add_subdirectory(linalg) add_subdirectory(logger) diff --git a/example/intrinsics/CMakeLists.txt b/example/intrinsics/CMakeLists.txt new file mode 100644 index 000000000..1645ba8a1 --- /dev/null +++ b/example/intrinsics/CMakeLists.txt @@ -0,0 +1,2 @@ +ADD_EXAMPLE(sum) +ADD_EXAMPLE(dot_product) \ No newline at end of file diff --git a/example/intrinsics/example_dot_product.f90 b/example/intrinsics/example_dot_product.f90 new file mode 100644 index 000000000..c0d111d83 --- /dev/null +++ b/example/intrinsics/example_dot_product.f90 @@ -0,0 +1,18 @@ +program example_dot_product + use stdlib_kinds, only: sp + use stdlib_intrinsics, only: fprod, fprod_kahan + implicit none + + real(sp), allocatable :: x(:), y(:) + real(sp) :: total_prod(3) + + allocate( x(1000), y(1000) ) + call random_number(x) + call random_number(y) + + total_prod(1) = dot_product(x,y) !> compiler intrinsic + total_prod(2) = fprod(x,y) !> chunked summation over inner product + total_prod(3) = fprod_kahan(x,y) !> chunked kahan summation over inner product + print *, total_prod(1:3) + +end program example_dot_product \ No newline at end of file diff --git a/example/intrinsics/example_sum.f90 b/example/intrinsics/example_sum.f90 new file mode 100644 index 000000000..70c59d954 --- /dev/null +++ b/example/intrinsics/example_sum.f90 @@ -0,0 +1,17 @@ +program example_sum + use stdlib_kinds, only: sp + use stdlib_intrinsics, only: fsum, fsum_kahan + implicit none + + real(sp), allocatable :: x(:) + real(sp) :: total_sum(3) + + allocate( x(1000) ) + call random_number(x) + + total_sum(1) = sum(x) !> compiler intrinsic + total_sum(2) = fsum(x) !> chunked summation + total_sum(3) = fsum_kahan(x)!> chunked kahan summation + print *, total_sum(1:3) + +end program example_sum \ No newline at end of file From 7c6e8a436f61c39f5b19b17a8ce8abc31b294fef Mon Sep 17 00:00:00 2001 From: jalvesz Date: Fri, 3 Jan 2025 21:21:16 +0100 Subject: [PATCH 09/18] fix specs --- doc/specs/stdlib_intrinsics.md | 20 ++++++-------------- 1 file changed, 6 insertions(+), 14 deletions(-) diff --git a/doc/specs/stdlib_intrinsics.md b/doc/specs/stdlib_intrinsics.md index 099d6ccb4..2a7fa3ea4 100644 --- a/doc/specs/stdlib_intrinsics.md +++ b/doc/specs/stdlib_intrinsics.md @@ -32,18 +32,13 @@ Pure function. #### Argument(s) `x`: 1D array of either `real` or `complex` type. This argument is `intent(in)`. + `mask` (optional): 1D array of `logical` values. This argument is `intent(in)`. #### Output value or Result value The output is a scalar of `type` and `kind` same as to that of `x`. -#### Example - -```fortran -{!example/math/example_intrinsics_sum.f90!} -``` - ### `fsum_kahan` function @@ -79,6 +74,7 @@ Pure function. #### Argument(s) `x`: 1D array of either `real` or `complex` type. This argument is `intent(in)`. + `mask` (optional): 1D array of `logical` values. This argument is `intent(in)`. #### Output value or Result value @@ -88,7 +84,7 @@ The output is a scalar of `type` and `kind` same as to that of `x`. #### Example ```fortran -{!example/math/example_intrinsics_sum.f90!} +{!example/intrinsics/example_sum.f90!} ``` @@ -113,18 +109,13 @@ Pure function. #### Argument(s) `x`: 1D array of either `real` or `complex` type. This argument is `intent(in)`. + `y`: 1D array of the same type and kind as `x`. This argument is `intent(in)`. #### Output value or Result value The output is a scalar of `type` and `kind` same as to that of `x` and `y`. -#### Example - -```fortran -{!example/math/example_intrinsics_dot_duct.f90!} -``` - ### `fprod_kahan` function @@ -147,6 +138,7 @@ Pure function. #### Argument(s) `x`: 1D array of either `real` or `complex` type. This argument is `intent(in)`. + `y`: 1D array of the same type and kind as `x`. This argument is `intent(in)`. #### Output value or Result value @@ -154,5 +146,5 @@ Pure function. The output is a scalar of `type` and `kind` same as to that of `x` and `y`. ```fortran -{!example/math/example_intrinsics_dot_duct.f90!} +{!example/intrinsics/example_dot_product.f90!} ``` \ No newline at end of file From 7cea1fd43e27da4a23799166043182bf7463f076 Mon Sep 17 00:00:00 2001 From: jalvesz Date: Fri, 3 Jan 2025 22:11:54 +0100 Subject: [PATCH 10/18] fix test: complex initialization --- test/intrinsics/test_intrinsics.fypp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/intrinsics/test_intrinsics.fypp b/test/intrinsics/test_intrinsics.fypp index f83f60759..8969d3afb 100644 --- a/test/intrinsics/test_intrinsics.fypp +++ b/test/intrinsics/test_intrinsics.fypp @@ -90,7 +90,7 @@ subroutine test_sum(error) allocate(x(n)) do i = 1, n - x(i) = complex(& + x(i) = cmplx(& 8*atan(1._${k1}$)*(real(i,kind=${k1}$)-0.5_${k1}$)/real(2*n,kind=${k1}$)**2,& 8*atan(1._${k1}$)*(real(i+n,kind=${k1}$)-0.5_${k1}$)/real(2*n,kind=${k1}$)**2) end do @@ -182,7 +182,7 @@ subroutine test_dot_product(error) allocate(x(n)) do i = 1, n - x(i) = complex(& + x(i) = cmplx(& sqrt(8*atan(1._${k1}$)*(real(i,kind=${k1}$)-0.5_${k1}$)/real(2*n,kind=${k1}$)**2),& sqrt(8*atan(1._${k1}$)*(real(i+n,kind=${k1}$)-0.5_${k1}$)/real(2*n,kind=${k1}$)**2)) end do From eaffa4a8aa1c6c5374cb6b5bda6ee3b0d048f6d8 Mon Sep 17 00:00:00 2001 From: jalvesz Date: Sat, 4 Jan 2025 19:26:12 +0100 Subject: [PATCH 11/18] fix test: complex assignment caused accuracy loss --- test/intrinsics/test_intrinsics.fypp | 99 +++++++++++----------------- 1 file changed, 37 insertions(+), 62 deletions(-) diff --git a/test/intrinsics/test_intrinsics.fypp b/test/intrinsics/test_intrinsics.fypp index 8969d3afb..3f1a3a98f 100644 --- a/test/intrinsics/test_intrinsics.fypp +++ b/test/intrinsics/test_intrinsics.fypp @@ -28,7 +28,7 @@ subroutine test_sum(error) type(error_type), allocatable, intent(out) :: error !> Internal parameters and variables - integer, parameter :: n = 1e3, ncalc = 3, niter = 100 + integer, parameter :: n = 1e3, ncalc = 3 real(sp) :: u integer :: iter, i, j !==================================================================================== @@ -36,7 +36,7 @@ subroutine test_sum(error) block ${t1}$, allocatable :: x(:) ${t1}$, parameter :: total_sum = 4*atan(1._${k1}$), tolerance = epsilon(1._${k1}$)*100 - ${t1}$ :: xsum(ncalc), meanval(ncalc), err(ncalc) + ${t1}$ :: xsum(ncalc), err(ncalc) logical, allocatable :: mask(:), nmask(:) allocate(x(n)) @@ -54,26 +54,18 @@ subroutine test_sum(error) call swap( nmask(i), nmask(j) ) end do - err(:) = 0._${k1}$ - do iter = 1, niter - xsum(1) = sum(x) ! compiler intrinsic - xsum(2) = fsum_kahan(x) ! chunked Kahan summation - xsum(3) = fsum(x) ! chunked summation - err(1:ncalc) = err(1:ncalc) + abs(1._${k1}$-xsum(1:ncalc)/total_sum) - end do - err(1:ncalc) = err(1:ncalc) / niter + xsum(1) = sum(x) ! compiler intrinsic + xsum(2) = fsum_kahan(x) ! chunked Kahan summation + xsum(3) = fsum(x) ! chunked summation + err(1:ncalc) = abs(1._${k1}$-xsum(1:ncalc)/total_sum) call check(error, all(err(:) Internal parameters and variables - integer, parameter :: n = 1e3, ncalc = 3, niter = 100 + integer, parameter :: n = 1e3, ncalc = 3 real(sp) :: u integer :: iter, i, j !==================================================================================== @@ -146,11 +131,11 @@ subroutine test_dot_product(error) block ${t1}$, allocatable :: x(:) ${t1}$, parameter :: total_sum = 4*atan(1._${k1}$), tolerance = epsilon(1._${k1}$)*100 - ${t1}$ :: xsum(ncalc), meanval(ncalc), err(ncalc) + ${t1}$ :: xsum(ncalc), err(ncalc) allocate(x(n)) do i = 1, n - x(i) = sqrt( 8*atan(1._${k1}$)*(real(i,kind=${k1}$)-0.5_${k1}$)/real(n,kind=${k1}$)**2 ) + x(i) = 2*sqrt( 2*atan(1._${k1}$)*(real(i,kind=${k1}$)-0.5_${k1}$) )/n end do ! scramble array do i = 1, n @@ -159,14 +144,10 @@ subroutine test_dot_product(error) call swap( x(i), x(j) ) end do - err(:) = 0._${k1}$ - do iter = 1, niter - xsum(1) = dot_product(x,x) ! compiler intrinsic - xsum(2) = fprod_kahan(x,x) ! chunked Kahan summation - xsum(3) = fprod(x,x) ! chunked summation - err(1:ncalc) = err(1:ncalc) + abs(1._${k1}$-xsum(1:ncalc)/total_sum) - end do - err(1:ncalc) = err(1:ncalc) / niter + xsum(1) = dot_product(x,x) ! compiler intrinsic + xsum(2) = fprod_kahan(x,x) ! chunked Kahan summation + xsum(3) = fprod(x,x) ! chunked summation + err(1:ncalc) = abs(1._${k1}$-xsum(1:ncalc)/total_sum) call check(error, all(err(:) Date: Sun, 5 Jan 2025 21:38:35 +0100 Subject: [PATCH 12/18] extend fsum support for ndarrays --- src/stdlib_intrinsics.fypp | 14 ++++++ src/stdlib_intrinsics_sum.fypp | 73 ++++++++++++++++++++++++++++ test/intrinsics/test_intrinsics.fypp | 20 +++++++- 3 files changed, 106 insertions(+), 1 deletion(-) diff --git a/src/stdlib_intrinsics.fypp b/src/stdlib_intrinsics.fypp index 0a4d3244a..1b9604cee 100644 --- a/src/stdlib_intrinsics.fypp +++ b/src/stdlib_intrinsics.fypp @@ -2,6 +2,7 @@ #:set R_KINDS_TYPES = list(zip(REAL_KINDS, REAL_TYPES, REAL_SUFFIX)) #:set C_KINDS_TYPES = list(zip(CMPLX_KINDS, CMPLX_TYPES, CMPLX_SUFFIX)) #:set RC_KINDS_TYPES = R_KINDS_TYPES + C_KINDS_TYPES +#:set RANKS = range(1, MAXRANK + 1) ! This module is based on https://github.com/jalvesz/fast_math module stdlib_intrinsics @@ -22,6 +23,19 @@ module stdlib_intrinsics logical, intent(in) :: mask(:) ${rt}$ :: s end function + #:for rank in RANKS + pure module function fsum_${rank}$d_${rs}$( x, mask ) result( s ) + ${rt}$, intent(in) :: x${ranksuffix(rank)}$ + logical, intent(in), optional :: mask${ranksuffix(rank)}$ + ${rt}$ :: s + end function + pure module function fsum_${rank}$d_dim_${rs}$( x , dim, mask ) result( s ) + ${rt}$, intent(in) :: x${ranksuffix(rank)}$ + integer, intent(in):: dim + logical, intent(in), optional :: mask${ranksuffix(rank)}$ + ${rt}$ :: s${reduced_shape('x', rank, 'dim')}$ + end function + #:endfor #:endfor end interface public :: fsum diff --git a/src/stdlib_intrinsics_sum.fypp b/src/stdlib_intrinsics_sum.fypp index e189ca897..f1f831be4 100644 --- a/src/stdlib_intrinsics_sum.fypp +++ b/src/stdlib_intrinsics_sum.fypp @@ -2,6 +2,7 @@ #:set R_KINDS_TYPES = list(zip(REAL_KINDS, REAL_TYPES, REAL_SUFFIX)) #:set C_KINDS_TYPES = list(zip(CMPLX_KINDS, CMPLX_TYPES, CMPLX_SUFFIX)) #:set RC_KINDS_TYPES = R_KINDS_TYPES + C_KINDS_TYPES +#:set RANKS = range(1, MAXRANK + 1) #:def cnjg(type,expression) #:if 'complex' in type @@ -66,6 +67,78 @@ pure module function fsum_1d_${rs}$_mask(a,mask) result(s) s = s + abatch(i)+abatch(chunk/2+i) end do end function + +#:for rank in RANKS +pure module function fsum_${rank}$d_${rs}$( x , mask ) result( s ) + ${rt}$, intent(in) :: x${ranksuffix(rank)}$ + logical, intent(in), optional :: mask${ranksuffix(rank)}$ + ${rt}$ :: s + if(.not.present(mask)) then + s = sum_recast(x,size(x)) + else + s = sum_recast_mask(x,mask,size(x)) + end if +contains + pure ${rt}$ function sum_recast(b,n) + integer, intent(in) :: n + ${rt}$, intent(in) :: b(n) + sum_recast = fsum(b) + end function + pure ${rt}$ function sum_recast_mask(b,m,n) + integer, intent(in) :: n + ${rt}$, intent(in) :: b(n) + logical, intent(in) :: m(n) + sum_recast_mask = fsum(b,m) + end function +end function + +pure module function fsum_${rank}$d_dim_${rs}$( x , dim, mask ) result( s ) + ${rt}$, intent(in) :: x${ranksuffix(rank)}$ + integer, intent(in):: dim + logical, intent(in), optional :: mask${ranksuffix(rank)}$ + ${rt}$ :: s${reduced_shape('x', rank, 'dim')}$ + integer :: j + + if(.not.present(mask)) then + if(dim<${rank}$)then + do j = 1, size(x,dim=${rank}$) + #:if rank == 2 + s${select_subarray(rank-1, [(rank-1, 'j')])}$ = fsum( x${select_subarray(rank, [(rank, 'j')])}$ ) + #:else + s${select_subarray(rank-1, [(rank-1, 'j')])}$ = fsum( x${select_subarray(rank, [(rank, 'j')])}$, dim=dim ) + #:endif + end do + else + do j = 1, size(x,dim=1) + #:if rank == 2 + s${select_subarray(rank-1, [(1, 'j')])}$ = fsum( x${select_subarray(rank, [(1, 'j')])}$ ) + #:else + s${select_subarray(rank-1, [(1, 'j')])}$ = fsum( x${select_subarray(rank, [(1, 'j')])}$, dim=${rank-1}$ ) + #:endif + end do + end if + else + if(dim<${rank}$)then + do j = 1, size(x,dim=${rank}$) + #:if rank == 2 + s${select_subarray(rank-1, [(rank-1, 'j')])}$ = fsum( x${select_subarray(rank, [(rank, 'j')])}$, mask=mask${select_subarray(rank, [(rank, 'j')])}$ ) + #:else + s${select_subarray(rank-1, [(rank-1, 'j')])}$ = fsum( x${select_subarray(rank, [(rank, 'j')])}$, dim=dim, mask=mask${select_subarray(rank, [(rank, 'j')])}$ ) + #:endif + end do + else + do j = 1, size(x,dim=1) + #:if rank == 2 + s${select_subarray(rank-1, [(1, 'j')])}$ = fsum( x${select_subarray(rank, [(1, 'j')])}$, mask=mask${select_subarray(rank, [(1, 'j')])}$ ) + #:else + s${select_subarray(rank-1, [(1, 'j')])}$ = fsum( x${select_subarray(rank, [(1, 'j')])}$, dim=${rank-1}$, mask=mask${select_subarray(rank, [(1, 'j')])}$ ) + #:endif + end do + end if + end if + +end function +#:endfor #:endfor #:for rk, rt, rs in RC_KINDS_TYPES diff --git a/test/intrinsics/test_intrinsics.fypp b/test/intrinsics/test_intrinsics.fypp index 3f1a3a98f..ce8ec5c0b 100644 --- a/test/intrinsics/test_intrinsics.fypp +++ b/test/intrinsics/test_intrinsics.fypp @@ -110,12 +110,30 @@ subroutine test_sum(error) xsum(3) = fsum(x,mask)+fsum(x,nmask) ! chunked summation err(1:ncalc) = abs(1._${k1}$-(xsum(1:ncalc)%re)/total_sum) - call check(error, all(err(:) sum all elements + call check(error, abs( sum(x) - fsum(x) ) sum over specific rank dim + do i = 1, rank(x) + call check(error, norm2( sum(x,dim=i) - fsum(x,dim=i) ) Date: Sun, 5 Jan 2025 21:39:44 +0100 Subject: [PATCH 13/18] remove unnecessary definition --- src/stdlib_intrinsics_sum.fypp | 8 -------- 1 file changed, 8 deletions(-) diff --git a/src/stdlib_intrinsics_sum.fypp b/src/stdlib_intrinsics_sum.fypp index f1f831be4..7c6cbc522 100644 --- a/src/stdlib_intrinsics_sum.fypp +++ b/src/stdlib_intrinsics_sum.fypp @@ -4,14 +4,6 @@ #:set RC_KINDS_TYPES = R_KINDS_TYPES + C_KINDS_TYPES #:set RANKS = range(1, MAXRANK + 1) -#:def cnjg(type,expression) -#:if 'complex' in type -conjg(${expression}$) -#:else -${expression}$ -#:endif -#:enddef - ! This module is based on https://github.com/jalvesz/fast_math submodule(stdlib_intrinsics) stdlib_intrinsics_sum !!Replacement for certain Fortran intrinsic functions offering either faster and/or more accurate implementations. From 47396acf367f01ba06e93664e0346f769436ef33 Mon Sep 17 00:00:00 2001 From: jalvesz Date: Sun, 5 Jan 2025 21:56:50 +0100 Subject: [PATCH 14/18] update specs, change name of kahan kernel --- doc/specs/stdlib_intrinsics.md | 14 +++++++++----- src/stdlib_intrinsics.fypp | 14 +++++++------- src/stdlib_intrinsics_dot_product.fypp | 6 +++--- src/stdlib_intrinsics_sum.fypp | 14 +++++++------- 4 files changed, 26 insertions(+), 22 deletions(-) diff --git a/doc/specs/stdlib_intrinsics.md b/doc/specs/stdlib_intrinsics.md index 2a7fa3ea4..c9baf619c 100644 --- a/doc/specs/stdlib_intrinsics.md +++ b/doc/specs/stdlib_intrinsics.md @@ -15,12 +15,14 @@ The `stdlib_intrinsics` module provides replacements for some of the well known #### Description -The `fsum` function can replace the intrinsic `sum` for 1D `real` or `complex` arrays. It follows a chunked implementation which maximizes vectorization potential as well as reducing the round-off error. This procedure is recommended when summing large arrays, for repetitive summation of smaller arrays consider the classical `sum`. +The `fsum` function can replace the intrinsic `sum` for `real` or `complex` arrays. It follows a chunked implementation which maximizes vectorization potential as well as reducing the round-off error. This procedure is recommended when summing large arrays, for repetitive summation of smaller arrays consider the classical `sum`. #### Syntax `res = ` [[stdlib_intrinsics(module):fsum(interface)]] ` (x [,mask] )` +`res = ` [[stdlib_intrinsics(module):fsum(interface)]] ` (x, dim [,mask] )` + #### Status Experimental @@ -31,13 +33,15 @@ Pure function. #### Argument(s) -`x`: 1D array of either `real` or `complex` type. This argument is `intent(in)`. +`x`: N-D array of either `real` or `complex` type. This argument is `intent(in)`. -`mask` (optional): 1D array of `logical` values. This argument is `intent(in)`. +`dim` (optional): scalar of type `integer` with a value in the range from 1 to n, where n equals the rank of `x`. + +`mask` (optional): N-D array of `logical` values, with the same shape as `x`. This argument is `intent(in)`. #### Output value or Result value -The output is a scalar of `type` and `kind` same as to that of `x`. +If `dim` is absent, the output is a scalar of the same `type` and `kind` as to that of `x`. Otherwise, an array of rank n-1, where n equals the rank of `x`, and a shape similar to that of `x` with dimension `dim` dropped is returned. ### `fsum_kahan` function @@ -47,7 +51,7 @@ The output is a scalar of `type` and `kind` same as to that of `x`. The `fsum_kahan` function can replace the intrinsic `sum` for 1D `real` or `complex` arrays. It follows a chunked implementation which maximizes vectorization potential, complemented by an `elemental` kernel based on the [kahan summation](https://en.wikipedia.org/wiki/Kahan_summation_algorithm) strategy to reduce the round-off error: ```fortran -elemental subroutine vkahan_(a,s,c) +elemental subroutine kahan_kernel_(a,s,c) type(), intent(in) :: a type(), intent(inout) :: s type(), intent(inout) :: c diff --git a/src/stdlib_intrinsics.fypp b/src/stdlib_intrinsics.fypp index 1b9604cee..f8da7a876 100644 --- a/src/stdlib_intrinsics.fypp +++ b/src/stdlib_intrinsics.fypp @@ -2,7 +2,7 @@ #:set R_KINDS_TYPES = list(zip(REAL_KINDS, REAL_TYPES, REAL_SUFFIX)) #:set C_KINDS_TYPES = list(zip(CMPLX_KINDS, CMPLX_TYPES, CMPLX_SUFFIX)) #:set RC_KINDS_TYPES = R_KINDS_TYPES + C_KINDS_TYPES -#:set RANKS = range(1, MAXRANK + 1) +#:set RANKS = range(2, MAXRANK + 1) ! This module is based on https://github.com/jalvesz/fast_math module stdlib_intrinsics @@ -77,18 +77,18 @@ module stdlib_intrinsics end interface public :: fprod_kahan - interface vkahan + interface kahan_kernel #:for rk, rt, rs in RC_KINDS_TYPES - module procedure :: vkahan_${rs}$ - module procedure :: vkahan_m_${rs}$ + module procedure :: kahan_kernel_${rs}$ + module procedure :: kahan_kernel_m_${rs}$ #:endfor end interface - public :: vkahan + public :: kahan_kernel contains #:for rk, rt, rs in RC_KINDS_TYPES -elemental subroutine vkahan_${rs}$(a,s,c) +elemental subroutine kahan_kernel_${rs}$(a,s,c) ${rt}$, intent(in) :: a ${rt}$, intent(inout) :: s ${rt}$, intent(inout) :: c @@ -98,7 +98,7 @@ elemental subroutine vkahan_${rs}$(a,s,c) c = (t - s) - y s = t end subroutine -elemental subroutine vkahan_m_${rs}$(a,s,c,m) +elemental subroutine kahan_kernel_m_${rs}$(a,s,c,m) ${rt}$, intent(in) :: a ${rt}$, intent(inout) :: s ${rt}$, intent(inout) :: c diff --git a/src/stdlib_intrinsics_dot_product.fypp b/src/stdlib_intrinsics_dot_product.fypp index 045d512b2..73e4fffcc 100644 --- a/src/stdlib_intrinsics_dot_product.fypp +++ b/src/stdlib_intrinsics_dot_product.fypp @@ -61,13 +61,13 @@ pure module function fprod_kahan_${s1}$(a,b) result(p) pbatch = zero_${s1}$ cbatch = zero_${s1}$ do i = 1, dr - call vkahan( a(chunk*i-chunk+1:chunk*i)*${cnjg(t1,'b(chunk*i-chunk+1:chunk*i)')}$ , pbatch(1:chunk) , cbatch(1:chunk) ) + call kahan_kernel( a(chunk*i-chunk+1:chunk*i)*${cnjg(t1,'b(chunk*i-chunk+1:chunk*i)')}$ , pbatch(1:chunk) , cbatch(1:chunk) ) end do - call vkahan( a(size(a)-rr+1:size(a))*${cnjg(t1,'b(size(a)-rr+1:size(a))')}$ , pbatch(1:rr) , cbatch(1:rr) ) + call kahan_kernel( a(size(a)-rr+1:size(a))*${cnjg(t1,'b(size(a)-rr+1:size(a))')}$ , pbatch(1:rr) , cbatch(1:rr) ) p = zero_${s1}$ do i = 1,chunk - call vkahan( pbatch(i) , p , cbatch(i) ) + call kahan_kernel( pbatch(i) , p , cbatch(i) ) end do end function #:endfor diff --git a/src/stdlib_intrinsics_sum.fypp b/src/stdlib_intrinsics_sum.fypp index 7c6cbc522..5fb012384 100644 --- a/src/stdlib_intrinsics_sum.fypp +++ b/src/stdlib_intrinsics_sum.fypp @@ -2,7 +2,7 @@ #:set R_KINDS_TYPES = list(zip(REAL_KINDS, REAL_TYPES, REAL_SUFFIX)) #:set C_KINDS_TYPES = list(zip(CMPLX_KINDS, CMPLX_TYPES, CMPLX_SUFFIX)) #:set RC_KINDS_TYPES = R_KINDS_TYPES + C_KINDS_TYPES -#:set RANKS = range(1, MAXRANK + 1) +#:set RANKS = range(2, MAXRANK + 1) ! This module is based on https://github.com/jalvesz/fast_math submodule(stdlib_intrinsics) stdlib_intrinsics_sum @@ -146,13 +146,13 @@ pure module function fsum_kahan_1d_${rs}$(a) result(s) sbatch = zero_${rs}$ cbatch = zero_${rs}$ do i = 1, dr - call vkahan( a(chunk*i-chunk+1:chunk*i) , sbatch(1:chunk) , cbatch(1:chunk) ) + call kahan_kernel( a(chunk*i-chunk+1:chunk*i) , sbatch(1:chunk) , cbatch(1:chunk) ) end do - call vkahan( a(size(a)-rr+1:size(a)) , sbatch(1:rr) , cbatch(1:rr) ) + call kahan_kernel( a(size(a)-rr+1:size(a)) , sbatch(1:rr) , cbatch(1:rr) ) s = zero_${rs}$ do i = 1,chunk - call vkahan( sbatch(i) , s , cbatch(i) ) + call kahan_kernel( sbatch(i) , s , cbatch(i) ) end do end function @@ -169,13 +169,13 @@ pure module function fsum_kahan_1d_${rs}$_mask(a,mask) result(s) sbatch = zero_${rs}$ cbatch = zero_${rs}$ do i = 1, dr - call vkahan( a(chunk*i-chunk+1:chunk*i) , sbatch(1:chunk) , cbatch(1:chunk) , mask(chunk*i-chunk+1:chunk*i) ) + call kahan_kernel( a(chunk*i-chunk+1:chunk*i) , sbatch(1:chunk) , cbatch(1:chunk) , mask(chunk*i-chunk+1:chunk*i) ) end do - call vkahan( a(size(a)-rr+1:size(a)) , sbatch(1:rr) , cbatch(1:rr) , mask(size(a)-rr+1:size(a)) ) + call kahan_kernel( a(size(a)-rr+1:size(a)) , sbatch(1:rr) , cbatch(1:rr) , mask(size(a)-rr+1:size(a)) ) s = zero_${rs}$ do i = 1,chunk - call vkahan( sbatch(i) , s , cbatch(i) ) + call kahan_kernel( sbatch(i) , s , cbatch(i) ) end do end function #:endfor From ecb705070fbef96fd3cce20312b6743fe29f39d3 Mon Sep 17 00:00:00 2001 From: jalvesz Date: Tue, 7 Jan 2025 20:06:09 +0100 Subject: [PATCH 15/18] small reorganization --- src/stdlib_intrinsics_sum.fypp | 96 ++++++++++++++-------------- test/intrinsics/test_intrinsics.fypp | 3 +- 2 files changed, 51 insertions(+), 48 deletions(-) diff --git a/src/stdlib_intrinsics_sum.fypp b/src/stdlib_intrinsics_sum.fypp index 5fb012384..65ab2f574 100644 --- a/src/stdlib_intrinsics_sum.fypp +++ b/src/stdlib_intrinsics_sum.fypp @@ -16,6 +16,7 @@ submodule(stdlib_intrinsics) stdlib_intrinsics_sum contains +!================= 1D Base implementations ============ #:for rk, rt, rs in RC_KINDS_TYPES pure module function fsum_1d_${rs}$(a) result(s) ${rt}$, intent(in) :: a(:) @@ -60,6 +61,54 @@ pure module function fsum_1d_${rs}$_mask(a,mask) result(s) end do end function +pure module function fsum_kahan_1d_${rs}$(a) result(s) + ${rt}$, intent(in) :: a(:) + ${rt}$ :: s + ${rt}$ :: sbatch(chunk) + ${rt}$ :: cbatch(chunk) + integer :: i, dr, rr + ! ----------------------------- + dr = size(a)/(chunk) + rr = size(a) - dr*chunk + sbatch = zero_${rs}$ + cbatch = zero_${rs}$ + do i = 1, dr + call kahan_kernel( a(chunk*i-chunk+1:chunk*i) , sbatch(1:chunk) , cbatch(1:chunk) ) + end do + call kahan_kernel( a(size(a)-rr+1:size(a)) , sbatch(1:rr) , cbatch(1:rr) ) + + s = zero_${rs}$ + do i = 1,chunk + call kahan_kernel( sbatch(i) , s , cbatch(i) ) + end do +end function + +pure module function fsum_kahan_1d_${rs}$_mask(a,mask) result(s) + ${rt}$, intent(in) :: a(:) + logical, intent(in) :: mask(:) + ${rt}$ :: s + ${rt}$ :: sbatch(chunk) + ${rt}$ :: cbatch(chunk) + integer :: i, dr, rr + ! ----------------------------- + dr = size(a)/(chunk) + rr = size(a) - dr*chunk + sbatch = zero_${rs}$ + cbatch = zero_${rs}$ + do i = 1, dr + call kahan_kernel( a(chunk*i-chunk+1:chunk*i) , sbatch(1:chunk) , cbatch(1:chunk) , mask(chunk*i-chunk+1:chunk*i) ) + end do + call kahan_kernel( a(size(a)-rr+1:size(a)) , sbatch(1:rr) , cbatch(1:rr) , mask(size(a)-rr+1:size(a)) ) + + s = zero_${rs}$ + do i = 1,chunk + call kahan_kernel( sbatch(i) , s , cbatch(i) ) + end do +end function +#:endfor + +!================= N-D implementations ============ +#:for rk, rt, rs in RC_KINDS_TYPES #:for rank in RANKS pure module function fsum_${rank}$d_${rs}$( x , mask ) result( s ) ${rt}$, intent(in) :: x${ranksuffix(rank)}$ @@ -133,51 +182,4 @@ end function #:endfor #:endfor -#:for rk, rt, rs in RC_KINDS_TYPES -pure module function fsum_kahan_1d_${rs}$(a) result(s) - ${rt}$, intent(in) :: a(:) - ${rt}$ :: s - ${rt}$ :: sbatch(chunk) - ${rt}$ :: cbatch(chunk) - integer :: i, dr, rr - ! ----------------------------- - dr = size(a)/(chunk) - rr = size(a) - dr*chunk - sbatch = zero_${rs}$ - cbatch = zero_${rs}$ - do i = 1, dr - call kahan_kernel( a(chunk*i-chunk+1:chunk*i) , sbatch(1:chunk) , cbatch(1:chunk) ) - end do - call kahan_kernel( a(size(a)-rr+1:size(a)) , sbatch(1:rr) , cbatch(1:rr) ) - - s = zero_${rs}$ - do i = 1,chunk - call kahan_kernel( sbatch(i) , s , cbatch(i) ) - end do -end function - -pure module function fsum_kahan_1d_${rs}$_mask(a,mask) result(s) - ${rt}$, intent(in) :: a(:) - logical, intent(in) :: mask(:) - ${rt}$ :: s - ${rt}$ :: sbatch(chunk) - ${rt}$ :: cbatch(chunk) - integer :: i, dr, rr - ! ----------------------------- - dr = size(a)/(chunk) - rr = size(a) - dr*chunk - sbatch = zero_${rs}$ - cbatch = zero_${rs}$ - do i = 1, dr - call kahan_kernel( a(chunk*i-chunk+1:chunk*i) , sbatch(1:chunk) , cbatch(1:chunk) , mask(chunk*i-chunk+1:chunk*i) ) - end do - call kahan_kernel( a(size(a)-rr+1:size(a)) , sbatch(1:rr) , cbatch(1:rr) , mask(size(a)-rr+1:size(a)) ) - - s = zero_${rs}$ - do i = 1,chunk - call kahan_kernel( sbatch(i) , s , cbatch(i) ) - end do -end function -#:endfor - end submodule stdlib_intrinsics_sum diff --git a/test/intrinsics/test_intrinsics.fypp b/test/intrinsics/test_intrinsics.fypp index ce8ec5c0b..19b6e4358 100644 --- a/test/intrinsics/test_intrinsics.fypp +++ b/test/intrinsics/test_intrinsics.fypp @@ -129,7 +129,8 @@ subroutine test_sum(error) !> sum over specific rank dim do i = 1, rank(x) - call check(error, norm2( sum(x,dim=i) - fsum(x,dim=i) ) Date: Sat, 11 Jan 2025 23:53:11 +0100 Subject: [PATCH 16/18] change names to stdlib_* --- doc/specs/stdlib_intrinsics.md | 26 +++++++++--------- example/intrinsics/example_dot_product.f90 | 6 ++-- example/intrinsics/example_sum.f90 | 6 ++-- src/stdlib_intrinsics.fypp | 32 +++++++++++----------- src/stdlib_intrinsics_dot_product.fypp | 4 +-- src/stdlib_intrinsics_sum.fypp | 32 +++++++++++----------- test/intrinsics/test_intrinsics.fypp | 30 ++++++++++---------- 7 files changed, 68 insertions(+), 68 deletions(-) diff --git a/doc/specs/stdlib_intrinsics.md b/doc/specs/stdlib_intrinsics.md index c9baf619c..023775f02 100644 --- a/doc/specs/stdlib_intrinsics.md +++ b/doc/specs/stdlib_intrinsics.md @@ -11,17 +11,17 @@ title: intrinsics The `stdlib_intrinsics` module provides replacements for some of the well known intrinsic functions found in Fortran compilers for which either a faster and/or more accurate implementation is found which has also proven of interest to the Fortran community. -### `fsum` function +### `stdlib_sum` function #### Description -The `fsum` function can replace the intrinsic `sum` for `real` or `complex` arrays. It follows a chunked implementation which maximizes vectorization potential as well as reducing the round-off error. This procedure is recommended when summing large arrays, for repetitive summation of smaller arrays consider the classical `sum`. +The `stdlib_sum` function can replace the intrinsic `sum` for `real` or `complex` arrays. It follows a chunked implementation which maximizes vectorization potential as well as reducing the round-off error. This procedure is recommended when summing large arrays, for repetitive summation of smaller arrays consider the classical `sum`. #### Syntax -`res = ` [[stdlib_intrinsics(module):fsum(interface)]] ` (x [,mask] )` +`res = ` [[stdlib_intrinsics(module):stdlib_sum(interface)]] ` (x [,mask] )` -`res = ` [[stdlib_intrinsics(module):fsum(interface)]] ` (x, dim [,mask] )` +`res = ` [[stdlib_intrinsics(module):stdlib_sum(interface)]] ` (x, dim [,mask] )` #### Status @@ -44,11 +44,11 @@ Pure function. If `dim` is absent, the output is a scalar of the same `type` and `kind` as to that of `x`. Otherwise, an array of rank n-1, where n equals the rank of `x`, and a shape similar to that of `x` with dimension `dim` dropped is returned. -### `fsum_kahan` function +### `stdlib_sum_kahan` function #### Description -The `fsum_kahan` function can replace the intrinsic `sum` for 1D `real` or `complex` arrays. It follows a chunked implementation which maximizes vectorization potential, complemented by an `elemental` kernel based on the [kahan summation](https://en.wikipedia.org/wiki/Kahan_summation_algorithm) strategy to reduce the round-off error: +The `stdlib_sum_kahan` function can replace the intrinsic `sum` for 1D `real` or `complex` arrays. It follows a chunked implementation which maximizes vectorization potential, complemented by an `elemental` kernel based on the [kahan summation](https://en.wikipedia.org/wiki/Kahan_summation_algorithm) strategy to reduce the round-off error: ```fortran elemental subroutine kahan_kernel_(a,s,c) @@ -65,7 +65,7 @@ end subroutine #### Syntax -`res = ` [[stdlib_intrinsics(module):fsum_kahan(interface)]] ` (x [,mask] )` +`res = ` [[stdlib_intrinsics(module):stdlib_sum_kahan(interface)]] ` (x [,mask] )` #### Status @@ -92,15 +92,15 @@ The output is a scalar of `type` and `kind` same as to that of `x`. ``` -### `fprod` function +### `stdlib_dot_product` function #### Description -The `fprod` function can replace the intrinsic `dot_product` for 1D `real` or `complex` arrays. It follows a chunked implementation which maximizes vectorization potential as well as reducing the round-off error. This procedure is recommended when crunching large arrays, for repetitive products of smaller arrays consider the classical `dot_product`. +The `stdlib_dot_product` function can replace the intrinsic `dot_product` for 1D `real` or `complex` arrays. It follows a chunked implementation which maximizes vectorization potential as well as reducing the round-off error. This procedure is recommended when crunching large arrays, for repetitive products of smaller arrays consider the classical `dot_product`. #### Syntax -`res = ` [[stdlib_intrinsics(module):fprod(interface)]] ` (x, y)` +`res = ` [[stdlib_intrinsics(module):stdlib_dot_product(interface)]] ` (x, y)` #### Status @@ -121,15 +121,15 @@ Pure function. The output is a scalar of `type` and `kind` same as to that of `x` and `y`. -### `fprod_kahan` function +### `stdlib_dot_product_kahan` function #### Description -The `fprod_kahan` function can replace the intrinsic `dot_product` for 1D `real` or `complex` arrays. It follows a chunked implementation which maximizes vectorization potential , complemented by the same `elemental` kernel based on the [kahan summation](https://en.wikipedia.org/wiki/Kahan_summation_algorithm) used for `fsum` to reduce the round-off error. +The `stdlib_dot_product_kahan` function can replace the intrinsic `dot_product` for 1D `real` or `complex` arrays. It follows a chunked implementation which maximizes vectorization potential , complemented by the same `elemental` kernel based on the [kahan summation](https://en.wikipedia.org/wiki/Kahan_summation_algorithm) used for `stdlib_sum` to reduce the round-off error. #### Syntax -`res = ` [[stdlib_intrinsics(module):fprod_kahan(interface)]] ` (x, y)` +`res = ` [[stdlib_intrinsics(module):stdlib_dot_product_kahan(interface)]] ` (x, y)` #### Status diff --git a/example/intrinsics/example_dot_product.f90 b/example/intrinsics/example_dot_product.f90 index c0d111d83..fb6e40610 100644 --- a/example/intrinsics/example_dot_product.f90 +++ b/example/intrinsics/example_dot_product.f90 @@ -1,6 +1,6 @@ program example_dot_product use stdlib_kinds, only: sp - use stdlib_intrinsics, only: fprod, fprod_kahan + use stdlib_intrinsics, only: stdlib_dot_product, stdlib_dot_product_kahan implicit none real(sp), allocatable :: x(:), y(:) @@ -11,8 +11,8 @@ program example_dot_product call random_number(y) total_prod(1) = dot_product(x,y) !> compiler intrinsic - total_prod(2) = fprod(x,y) !> chunked summation over inner product - total_prod(3) = fprod_kahan(x,y) !> chunked kahan summation over inner product + total_prod(2) = stdlib_dot_product(x,y) !> chunked summation over inner product + total_prod(3) = stdlib_dot_product_kahan(x,y) !> chunked kahan summation over inner product print *, total_prod(1:3) end program example_dot_product \ No newline at end of file diff --git a/example/intrinsics/example_sum.f90 b/example/intrinsics/example_sum.f90 index 70c59d954..0aec4835c 100644 --- a/example/intrinsics/example_sum.f90 +++ b/example/intrinsics/example_sum.f90 @@ -1,6 +1,6 @@ program example_sum use stdlib_kinds, only: sp - use stdlib_intrinsics, only: fsum, fsum_kahan + use stdlib_intrinsics, only: stdlib_sum, stdlib_sum_kahan implicit none real(sp), allocatable :: x(:) @@ -10,8 +10,8 @@ program example_sum call random_number(x) total_sum(1) = sum(x) !> compiler intrinsic - total_sum(2) = fsum(x) !> chunked summation - total_sum(3) = fsum_kahan(x)!> chunked kahan summation + total_sum(2) = stdlib_sum(x) !> chunked summation + total_sum(3) = stdlib_sum_kahan(x)!> chunked kahan summation print *, total_sum(1:3) end program example_sum \ No newline at end of file diff --git a/src/stdlib_intrinsics.fypp b/src/stdlib_intrinsics.fypp index f8da7a876..bf20c0fa3 100644 --- a/src/stdlib_intrinsics.fypp +++ b/src/stdlib_intrinsics.fypp @@ -12,24 +12,24 @@ module stdlib_intrinsics implicit none private - interface fsum + interface stdlib_sum #:for rk, rt, rs in RC_KINDS_TYPES - pure module function fsum_1d_${rs}$(a) result(s) + pure module function stdlib_sum_1d_${rs}$(a) result(s) ${rt}$, intent(in) :: a(:) ${rt}$ :: s end function - pure module function fsum_1d_${rs}$_mask(a,mask) result(s) + pure module function stdlib_sum_1d_${rs}$_mask(a,mask) result(s) ${rt}$, intent(in) :: a(:) logical, intent(in) :: mask(:) ${rt}$ :: s end function #:for rank in RANKS - pure module function fsum_${rank}$d_${rs}$( x, mask ) result( s ) + pure module function stdlib_sum_${rank}$d_${rs}$( x, mask ) result( s ) ${rt}$, intent(in) :: x${ranksuffix(rank)}$ logical, intent(in), optional :: mask${ranksuffix(rank)}$ ${rt}$ :: s end function - pure module function fsum_${rank}$d_dim_${rs}$( x , dim, mask ) result( s ) + pure module function stdlib_sum_${rank}$d_dim_${rs}$( x , dim, mask ) result( s ) ${rt}$, intent(in) :: x${ranksuffix(rank)}$ integer, intent(in):: dim logical, intent(in), optional :: mask${ranksuffix(rank)}$ @@ -38,44 +38,44 @@ module stdlib_intrinsics #:endfor #:endfor end interface - public :: fsum + public :: stdlib_sum - interface fsum_kahan + interface stdlib_sum_kahan #:for rk, rt, rs in RC_KINDS_TYPES - pure module function fsum_kahan_1d_${rs}$(a) result(s) + pure module function stdlib_sum_kahan_1d_${rs}$(a) result(s) ${rt}$, intent(in) :: a(:) ${rt}$ :: s end function - pure module function fsum_kahan_1d_${rs}$_mask(a,mask) result(s) + pure module function stdlib_sum_kahan_1d_${rs}$_mask(a,mask) result(s) ${rt}$, intent(in) :: a(:) logical, intent(in) :: mask(:) ${rt}$ :: s end function #:endfor end interface - public :: fsum_kahan + public :: stdlib_sum_kahan - interface fprod + interface stdlib_dot_product #:for rk, rt, rs in RC_KINDS_TYPES - pure module function fprod_${rs}$(a,b) result(p) + pure module function stdlib_dot_product_${rs}$(a,b) result(p) ${rt}$, intent(in) :: a(:) ${rt}$, intent(in) :: b(:) ${rt}$ :: p end function #:endfor end interface - public :: fprod + public :: stdlib_dot_product - interface fprod_kahan + interface stdlib_dot_product_kahan #:for rk, rt, rs in RC_KINDS_TYPES - pure module function fprod_kahan_${rs}$(a,b) result(p) + pure module function stdlib_dot_product_kahan_${rs}$(a,b) result(p) ${rt}$, intent(in) :: a(:) ${rt}$, intent(in) :: b(:) ${rt}$ :: p end function #:endfor end interface - public :: fprod_kahan + public :: stdlib_dot_product_kahan interface kahan_kernel #:for rk, rt, rs in RC_KINDS_TYPES diff --git a/src/stdlib_intrinsics_dot_product.fypp b/src/stdlib_intrinsics_dot_product.fypp index 73e4fffcc..551756a9c 100644 --- a/src/stdlib_intrinsics_dot_product.fypp +++ b/src/stdlib_intrinsics_dot_product.fypp @@ -24,7 +24,7 @@ submodule(stdlib_intrinsics) stdlib_intrinsics_dot_product contains #:for k1, t1, s1 in RC_KINDS_TYPES -pure module function fprod_${s1}$(a,b) result(p) +pure module function stdlib_dot_product_${s1}$(a,b) result(p) ${t1}$, intent(in) :: a(:) ${t1}$, intent(in) :: b(:) ${t1}$ :: p @@ -48,7 +48,7 @@ end function #:endfor #:for k1, t1, s1 in RC_KINDS_TYPES -pure module function fprod_kahan_${s1}$(a,b) result(p) +pure module function stdlib_dot_product_kahan_${s1}$(a,b) result(p) ${t1}$, intent(in) :: a(:) ${t1}$, intent(in) :: b(:) ${t1}$ :: p diff --git a/src/stdlib_intrinsics_sum.fypp b/src/stdlib_intrinsics_sum.fypp index 65ab2f574..f6b771cf9 100644 --- a/src/stdlib_intrinsics_sum.fypp +++ b/src/stdlib_intrinsics_sum.fypp @@ -18,7 +18,7 @@ contains !================= 1D Base implementations ============ #:for rk, rt, rs in RC_KINDS_TYPES -pure module function fsum_1d_${rs}$(a) result(s) +pure module function stdlib_sum_1d_${rs}$(a) result(s) ${rt}$, intent(in) :: a(:) ${rt}$ :: s ${rt}$ :: abatch(chunk) @@ -39,7 +39,7 @@ pure module function fsum_1d_${rs}$(a) result(s) end do end function -pure module function fsum_1d_${rs}$_mask(a,mask) result(s) +pure module function stdlib_sum_1d_${rs}$_mask(a,mask) result(s) ${rt}$, intent(in) :: a(:) logical, intent(in) :: mask(:) ${rt}$ :: s @@ -61,7 +61,7 @@ pure module function fsum_1d_${rs}$_mask(a,mask) result(s) end do end function -pure module function fsum_kahan_1d_${rs}$(a) result(s) +pure module function stdlib_sum_kahan_1d_${rs}$(a) result(s) ${rt}$, intent(in) :: a(:) ${rt}$ :: s ${rt}$ :: sbatch(chunk) @@ -83,7 +83,7 @@ pure module function fsum_kahan_1d_${rs}$(a) result(s) end do end function -pure module function fsum_kahan_1d_${rs}$_mask(a,mask) result(s) +pure module function stdlib_sum_kahan_1d_${rs}$_mask(a,mask) result(s) ${rt}$, intent(in) :: a(:) logical, intent(in) :: mask(:) ${rt}$ :: s @@ -110,7 +110,7 @@ end function !================= N-D implementations ============ #:for rk, rt, rs in RC_KINDS_TYPES #:for rank in RANKS -pure module function fsum_${rank}$d_${rs}$( x , mask ) result( s ) +pure module function stdlib_sum_${rank}$d_${rs}$( x , mask ) result( s ) ${rt}$, intent(in) :: x${ranksuffix(rank)}$ logical, intent(in), optional :: mask${ranksuffix(rank)}$ ${rt}$ :: s @@ -123,17 +123,17 @@ contains pure ${rt}$ function sum_recast(b,n) integer, intent(in) :: n ${rt}$, intent(in) :: b(n) - sum_recast = fsum(b) + sum_recast = stdlib_sum(b) end function pure ${rt}$ function sum_recast_mask(b,m,n) integer, intent(in) :: n ${rt}$, intent(in) :: b(n) logical, intent(in) :: m(n) - sum_recast_mask = fsum(b,m) + sum_recast_mask = stdlib_sum(b,m) end function end function -pure module function fsum_${rank}$d_dim_${rs}$( x , dim, mask ) result( s ) +pure module function stdlib_sum_${rank}$d_dim_${rs}$( x , dim, mask ) result( s ) ${rt}$, intent(in) :: x${ranksuffix(rank)}$ integer, intent(in):: dim logical, intent(in), optional :: mask${ranksuffix(rank)}$ @@ -144,17 +144,17 @@ pure module function fsum_${rank}$d_dim_${rs}$( x , dim, mask ) result( s ) if(dim<${rank}$)then do j = 1, size(x,dim=${rank}$) #:if rank == 2 - s${select_subarray(rank-1, [(rank-1, 'j')])}$ = fsum( x${select_subarray(rank, [(rank, 'j')])}$ ) + s${select_subarray(rank-1, [(rank-1, 'j')])}$ = stdlib_sum( x${select_subarray(rank, [(rank, 'j')])}$ ) #:else - s${select_subarray(rank-1, [(rank-1, 'j')])}$ = fsum( x${select_subarray(rank, [(rank, 'j')])}$, dim=dim ) + s${select_subarray(rank-1, [(rank-1, 'j')])}$ = stdlib_sum( x${select_subarray(rank, [(rank, 'j')])}$, dim=dim ) #:endif end do else do j = 1, size(x,dim=1) #:if rank == 2 - s${select_subarray(rank-1, [(1, 'j')])}$ = fsum( x${select_subarray(rank, [(1, 'j')])}$ ) + s${select_subarray(rank-1, [(1, 'j')])}$ = stdlib_sum( x${select_subarray(rank, [(1, 'j')])}$ ) #:else - s${select_subarray(rank-1, [(1, 'j')])}$ = fsum( x${select_subarray(rank, [(1, 'j')])}$, dim=${rank-1}$ ) + s${select_subarray(rank-1, [(1, 'j')])}$ = stdlib_sum( x${select_subarray(rank, [(1, 'j')])}$, dim=${rank-1}$ ) #:endif end do end if @@ -162,17 +162,17 @@ pure module function fsum_${rank}$d_dim_${rs}$( x , dim, mask ) result( s ) if(dim<${rank}$)then do j = 1, size(x,dim=${rank}$) #:if rank == 2 - s${select_subarray(rank-1, [(rank-1, 'j')])}$ = fsum( x${select_subarray(rank, [(rank, 'j')])}$, mask=mask${select_subarray(rank, [(rank, 'j')])}$ ) + s${select_subarray(rank-1, [(rank-1, 'j')])}$ = stdlib_sum( x${select_subarray(rank, [(rank, 'j')])}$, mask=mask${select_subarray(rank, [(rank, 'j')])}$ ) #:else - s${select_subarray(rank-1, [(rank-1, 'j')])}$ = fsum( x${select_subarray(rank, [(rank, 'j')])}$, dim=dim, mask=mask${select_subarray(rank, [(rank, 'j')])}$ ) + s${select_subarray(rank-1, [(rank-1, 'j')])}$ = stdlib_sum( x${select_subarray(rank, [(rank, 'j')])}$, dim=dim, mask=mask${select_subarray(rank, [(rank, 'j')])}$ ) #:endif end do else do j = 1, size(x,dim=1) #:if rank == 2 - s${select_subarray(rank-1, [(1, 'j')])}$ = fsum( x${select_subarray(rank, [(1, 'j')])}$, mask=mask${select_subarray(rank, [(1, 'j')])}$ ) + s${select_subarray(rank-1, [(1, 'j')])}$ = stdlib_sum( x${select_subarray(rank, [(1, 'j')])}$, mask=mask${select_subarray(rank, [(1, 'j')])}$ ) #:else - s${select_subarray(rank-1, [(1, 'j')])}$ = fsum( x${select_subarray(rank, [(1, 'j')])}$, dim=${rank-1}$, mask=mask${select_subarray(rank, [(1, 'j')])}$ ) + s${select_subarray(rank-1, [(1, 'j')])}$ = stdlib_sum( x${select_subarray(rank, [(1, 'j')])}$, dim=${rank-1}$, mask=mask${select_subarray(rank, [(1, 'j')])}$ ) #:endif end do end if diff --git a/test/intrinsics/test_intrinsics.fypp b/test/intrinsics/test_intrinsics.fypp index 19b6e4358..a8a14b701 100644 --- a/test/intrinsics/test_intrinsics.fypp +++ b/test/intrinsics/test_intrinsics.fypp @@ -55,16 +55,16 @@ subroutine test_sum(error) end do xsum(1) = sum(x) ! compiler intrinsic - xsum(2) = fsum_kahan(x) ! chunked Kahan summation - xsum(3) = fsum(x) ! chunked summation + xsum(2) = stdlib_sum_kahan(x) ! chunked Kahan summation + xsum(3) = stdlib_sum(x) ! chunked summation err(1:ncalc) = abs(1._${k1}$-xsum(1:ncalc)/total_sum) call check(error, all(err(:) sum all elements - call check(error, abs( sum(x) - fsum(x) ) sum over specific rank dim do i = 1, rank(x) - call check(error, norm2( sum(x,dim=i) - fsum(x,dim=i) ) Date: Sun, 12 Jan 2025 11:17:29 +0100 Subject: [PATCH 17/18] add comments --- src/stdlib_intrinsics.fypp | 52 +++++++++++++++++++++++++- src/stdlib_intrinsics_dot_product.fypp | 3 +- src/stdlib_intrinsics_sum.fypp | 3 +- 3 files changed, 52 insertions(+), 6 deletions(-) diff --git a/src/stdlib_intrinsics.fypp b/src/stdlib_intrinsics.fypp index bf20c0fa3..12dbac190 100644 --- a/src/stdlib_intrinsics.fypp +++ b/src/stdlib_intrinsics.fypp @@ -4,7 +4,6 @@ #:set RC_KINDS_TYPES = R_KINDS_TYPES + C_KINDS_TYPES #:set RANKS = range(2, MAXRANK + 1) -! This module is based on https://github.com/jalvesz/fast_math module stdlib_intrinsics !!Replacement for certain Fortran intrinsic functions offering either faster and/or more accurate implementations. !! ([Specification](../page/specs/stdlib_intrinsics.html)) @@ -12,7 +11,20 @@ module stdlib_intrinsics implicit none private - interface stdlib_sum + interface stdlib_sum + !! version: experimental + !! + !!### Summary + !! Sum elements of rank N arrays. + !! ([Specification](../page/specs/stdlib_intrinsics.html#stdlib_sum)) + !! + !!### Description + !! + !! This interface provides standard conforming call for sum of elements of any rank. + !! The 1-D base implementation follows a chunked approach for optimizing performance and increasing accuracy. + !! The `N-D` interfaces calls upon the `(N-1)-D` implementation. + !! Supported data types include `real` and `complex`. + !! #:for rk, rt, rs in RC_KINDS_TYPES pure module function stdlib_sum_1d_${rs}$(a) result(s) ${rt}$, intent(in) :: a(:) @@ -41,6 +53,18 @@ module stdlib_intrinsics public :: stdlib_sum interface stdlib_sum_kahan + !! version: experimental + !! + !!### Summary + !! Sum elements of rank 1 arrays. + !! ([Specification](../page/specs/stdlib_intrinsics.html#stdlib_sum_kahan)) + !! + !!### Description + !! + !! This interface provides standard conforming call for sum of elements of rank 1. + !! The 1-D base implementation follows a chunked approach combined with a kahan kernel for optimizing performance and increasing accuracy. + !! Supported data types include `real` and `complex`. + !! #:for rk, rt, rs in RC_KINDS_TYPES pure module function stdlib_sum_kahan_1d_${rs}$(a) result(s) ${rt}$, intent(in) :: a(:) @@ -56,6 +80,18 @@ module stdlib_intrinsics public :: stdlib_sum_kahan interface stdlib_dot_product + !! version: experimental + !! + !!### Summary + !! dot_product of rank 1 arrays. + !! ([Specification](../page/specs/stdlib_intrinsics.html#stdlib_dot_product)) + !! + !!### Description + !! + !! compute the dot_product of rank 1 arrays. + !! The 1-D base implementation follows a chunked approach for optimizing performance and increasing accuracy. + !! Supported data types include `real` and `complex`. + !! #:for rk, rt, rs in RC_KINDS_TYPES pure module function stdlib_dot_product_${rs}$(a,b) result(p) ${rt}$, intent(in) :: a(:) @@ -67,6 +103,18 @@ module stdlib_intrinsics public :: stdlib_dot_product interface stdlib_dot_product_kahan + !! version: experimental + !! + !!### Summary + !! dot_product of rank 1 arrays. + !! ([Specification](../page/specs/stdlib_intrinsics.html#stdlib_dot_product_kahan)) + !! + !!### Description + !! + !! compute the dot_product of rank 1 arrays. + !! The implementation follows a chunked approach combined with a kahan kernel for optimizing performance and increasing accuracy. + !! Supported data types include `real` and `complex`. + !! #:for rk, rt, rs in RC_KINDS_TYPES pure module function stdlib_dot_product_kahan_${rs}$(a,b) result(p) ${rt}$, intent(in) :: a(:) diff --git a/src/stdlib_intrinsics_dot_product.fypp b/src/stdlib_intrinsics_dot_product.fypp index 551756a9c..5120c290a 100644 --- a/src/stdlib_intrinsics_dot_product.fypp +++ b/src/stdlib_intrinsics_dot_product.fypp @@ -11,7 +11,6 @@ ${expression}$ #:endif #:enddef -! This module is based on https://github.com/jalvesz/fast_math submodule(stdlib_intrinsics) stdlib_intrinsics_dot_product !!Replacement for certain Fortran intrinsic functions offering either faster and/or more accurate implementations. !! ([Specification](../page/specs/stdlib_intrinsics.html)) @@ -22,7 +21,7 @@ submodule(stdlib_intrinsics) stdlib_intrinsics_dot_product integer, parameter :: chunk = 64 contains - +! This implementation is based on https://github.com/jalvesz/fast_math #:for k1, t1, s1 in RC_KINDS_TYPES pure module function stdlib_dot_product_${s1}$(a,b) result(p) ${t1}$, intent(in) :: a(:) diff --git a/src/stdlib_intrinsics_sum.fypp b/src/stdlib_intrinsics_sum.fypp index f6b771cf9..6d9053345 100644 --- a/src/stdlib_intrinsics_sum.fypp +++ b/src/stdlib_intrinsics_sum.fypp @@ -4,9 +4,7 @@ #:set RC_KINDS_TYPES = R_KINDS_TYPES + C_KINDS_TYPES #:set RANKS = range(2, MAXRANK + 1) -! This module is based on https://github.com/jalvesz/fast_math submodule(stdlib_intrinsics) stdlib_intrinsics_sum - !!Replacement for certain Fortran intrinsic functions offering either faster and/or more accurate implementations. !! ([Specification](../page/specs/stdlib_intrinsics.html)) use stdlib_kinds use stdlib_constants @@ -17,6 +15,7 @@ submodule(stdlib_intrinsics) stdlib_intrinsics_sum contains !================= 1D Base implementations ============ +! This implementation is based on https://github.com/jalvesz/fast_math #:for rk, rt, rs in RC_KINDS_TYPES pure module function stdlib_sum_1d_${rs}$(a) result(s) ${rt}$, intent(in) :: a(:) From 6e36b6f9a79c5aa28e3064a7eb56aebe30e8b17e Mon Sep 17 00:00:00 2001 From: Jose Alves Date: Fri, 17 Jan 2025 19:56:39 +0100 Subject: [PATCH 18/18] extend kahan sum for rank N arrays --- doc/specs/stdlib_intrinsics.md | 10 +++- src/stdlib_intrinsics.fypp | 18 +++++- src/stdlib_intrinsics_sum.fypp | 84 ++++++++++++++++++++++++++-- test/intrinsics/test_intrinsics.fypp | 7 +++ 4 files changed, 110 insertions(+), 9 deletions(-) diff --git a/doc/specs/stdlib_intrinsics.md b/doc/specs/stdlib_intrinsics.md index 023775f02..bba436d7b 100644 --- a/doc/specs/stdlib_intrinsics.md +++ b/doc/specs/stdlib_intrinsics.md @@ -48,7 +48,7 @@ If `dim` is absent, the output is a scalar of the same `type` and `kind` as to t #### Description -The `stdlib_sum_kahan` function can replace the intrinsic `sum` for 1D `real` or `complex` arrays. It follows a chunked implementation which maximizes vectorization potential, complemented by an `elemental` kernel based on the [kahan summation](https://en.wikipedia.org/wiki/Kahan_summation_algorithm) strategy to reduce the round-off error: +The `stdlib_sum_kahan` function can replace the intrinsic `sum` for `real` or `complex` arrays. It follows a chunked implementation which maximizes vectorization potential complemented by an `elemental` kernel based on the [kahan summation](https://en.wikipedia.org/wiki/Kahan_summation_algorithm) strategy to reduce the round-off error: ```fortran elemental subroutine kahan_kernel_(a,s,c) @@ -67,6 +67,8 @@ end subroutine `res = ` [[stdlib_intrinsics(module):stdlib_sum_kahan(interface)]] ` (x [,mask] )` +`res = ` [[stdlib_intrinsics(module):stdlib_sum_kahan(interface)]] ` (x, dim [,mask] )` + #### Status Experimental @@ -79,11 +81,13 @@ Pure function. `x`: 1D array of either `real` or `complex` type. This argument is `intent(in)`. -`mask` (optional): 1D array of `logical` values. This argument is `intent(in)`. +`dim` (optional): scalar of type `integer` with a value in the range from 1 to n, where n equals the rank of `x`. + +`mask` (optional): N-D array of `logical` values, with the same shape as `x`. This argument is `intent(in)`. #### Output value or Result value -The output is a scalar of `type` and `kind` same as to that of `x`. +If `dim` is absent, the output is a scalar of the same `type` and `kind` as to that of `x`. Otherwise, an array of rank n-1, where n equals the rank of `x`, and a shape similar to that of `x` with dimension `dim` dropped is returned. #### Example diff --git a/src/stdlib_intrinsics.fypp b/src/stdlib_intrinsics.fypp index 12dbac190..1910dd0ce 100644 --- a/src/stdlib_intrinsics.fypp +++ b/src/stdlib_intrinsics.fypp @@ -56,13 +56,14 @@ module stdlib_intrinsics !! version: experimental !! !!### Summary - !! Sum elements of rank 1 arrays. + !! Sum elements of rank N arrays. !! ([Specification](../page/specs/stdlib_intrinsics.html#stdlib_sum_kahan)) !! !!### Description !! - !! This interface provides standard conforming call for sum of elements of rank 1. + !! This interface provides standard conforming call for sum of elements of any rank. !! The 1-D base implementation follows a chunked approach combined with a kahan kernel for optimizing performance and increasing accuracy. + !! The `N-D` interfaces calls upon the `(N-1)-D` implementation. !! Supported data types include `real` and `complex`. !! #:for rk, rt, rs in RC_KINDS_TYPES @@ -75,6 +76,19 @@ module stdlib_intrinsics logical, intent(in) :: mask(:) ${rt}$ :: s end function + #:for rank in RANKS + pure module function stdlib_sum_kahan_${rank}$d_${rs}$( x, mask ) result( s ) + ${rt}$, intent(in) :: x${ranksuffix(rank)}$ + logical, intent(in), optional :: mask${ranksuffix(rank)}$ + ${rt}$ :: s + end function + pure module function stdlib_sum_kahan_${rank}$d_dim_${rs}$( x , dim, mask ) result( s ) + ${rt}$, intent(in) :: x${ranksuffix(rank)}$ + integer, intent(in):: dim + logical, intent(in), optional :: mask${ranksuffix(rank)}$ + ${rt}$ :: s${reduced_shape('x', rank, 'dim')}$ + end function + #:endfor #:endfor end interface public :: stdlib_sum_kahan diff --git a/src/stdlib_intrinsics_sum.fypp b/src/stdlib_intrinsics_sum.fypp index 6d9053345..84b449608 100644 --- a/src/stdlib_intrinsics_sum.fypp +++ b/src/stdlib_intrinsics_sum.fypp @@ -9,8 +9,6 @@ submodule(stdlib_intrinsics) stdlib_intrinsics_sum use stdlib_kinds use stdlib_constants implicit none - - integer, parameter :: chunk = 64 contains @@ -18,6 +16,7 @@ contains ! This implementation is based on https://github.com/jalvesz/fast_math #:for rk, rt, rs in RC_KINDS_TYPES pure module function stdlib_sum_1d_${rs}$(a) result(s) + integer, parameter :: chunk = 64 ${rt}$, intent(in) :: a(:) ${rt}$ :: s ${rt}$ :: abatch(chunk) @@ -39,6 +38,7 @@ pure module function stdlib_sum_1d_${rs}$(a) result(s) end function pure module function stdlib_sum_1d_${rs}$_mask(a,mask) result(s) + integer, parameter :: chunk = 64 ${rt}$, intent(in) :: a(:) logical, intent(in) :: mask(:) ${rt}$ :: s @@ -61,13 +61,14 @@ pure module function stdlib_sum_1d_${rs}$_mask(a,mask) result(s) end function pure module function stdlib_sum_kahan_1d_${rs}$(a) result(s) + integer, parameter :: chunk = 64 ${rt}$, intent(in) :: a(:) ${rt}$ :: s ${rt}$ :: sbatch(chunk) ${rt}$ :: cbatch(chunk) integer :: i, dr, rr ! ----------------------------- - dr = size(a)/(chunk) + dr = size(a)/chunk rr = size(a) - dr*chunk sbatch = zero_${rs}$ cbatch = zero_${rs}$ @@ -83,6 +84,7 @@ pure module function stdlib_sum_kahan_1d_${rs}$(a) result(s) end function pure module function stdlib_sum_kahan_1d_${rs}$_mask(a,mask) result(s) + integer, parameter :: chunk = 64 ${rt}$, intent(in) :: a(:) logical, intent(in) :: mask(:) ${rt}$ :: s @@ -90,7 +92,7 @@ pure module function stdlib_sum_kahan_1d_${rs}$_mask(a,mask) result(s) ${rt}$ :: cbatch(chunk) integer :: i, dr, rr ! ----------------------------- - dr = size(a)/(chunk) + dr = size(a)/chunk rr = size(a) - dr*chunk sbatch = zero_${rs}$ cbatch = zero_${rs}$ @@ -181,4 +183,78 @@ end function #:endfor #:endfor +#:for rk, rt, rs in RC_KINDS_TYPES +#:for rank in RANKS +pure module function stdlib_sum_kahan_${rank}$d_${rs}$( x , mask ) result( s ) + ${rt}$, intent(in) :: x${ranksuffix(rank)}$ + logical, intent(in), optional :: mask${ranksuffix(rank)}$ + ${rt}$ :: s + if(.not.present(mask)) then + s = sum_recast(x,size(x)) + else + s = sum_recast_mask(x,mask,size(x)) + end if +contains + pure ${rt}$ function sum_recast(b,n) + integer, intent(in) :: n + ${rt}$, intent(in) :: b(n) + sum_recast = stdlib_sum_kahan(b) + end function + pure ${rt}$ function sum_recast_mask(b,m,n) + integer, intent(in) :: n + ${rt}$, intent(in) :: b(n) + logical, intent(in) :: m(n) + sum_recast_mask = stdlib_sum_kahan(b,m) + end function +end function + +pure module function stdlib_sum_kahan_${rank}$d_dim_${rs}$( x , dim, mask ) result( s ) + ${rt}$, intent(in) :: x${ranksuffix(rank)}$ + integer, intent(in):: dim + logical, intent(in), optional :: mask${ranksuffix(rank)}$ + ${rt}$ :: s${reduced_shape('x', rank, 'dim')}$ + integer :: j + + if(.not.present(mask)) then + if(dim<${rank}$)then + do j = 1, size(x,dim=${rank}$) + #:if rank == 2 + s${select_subarray(rank-1, [(rank-1, 'j')])}$ = stdlib_sum_kahan( x${select_subarray(rank, [(rank, 'j')])}$ ) + #:else + s${select_subarray(rank-1, [(rank-1, 'j')])}$ = stdlib_sum_kahan( x${select_subarray(rank, [(rank, 'j')])}$, dim=dim ) + #:endif + end do + else + do j = 1, size(x,dim=1) + #:if rank == 2 + s${select_subarray(rank-1, [(1, 'j')])}$ = stdlib_sum_kahan( x${select_subarray(rank, [(1, 'j')])}$ ) + #:else + s${select_subarray(rank-1, [(1, 'j')])}$ = stdlib_sum_kahan( x${select_subarray(rank, [(1, 'j')])}$, dim=${rank-1}$ ) + #:endif + end do + end if + else + if(dim<${rank}$)then + do j = 1, size(x,dim=${rank}$) + #:if rank == 2 + s${select_subarray(rank-1, [(rank-1, 'j')])}$ = stdlib_sum_kahan( x${select_subarray(rank, [(rank, 'j')])}$, mask=mask${select_subarray(rank, [(rank, 'j')])}$ ) + #:else + s${select_subarray(rank-1, [(rank-1, 'j')])}$ = stdlib_sum_kahan( x${select_subarray(rank, [(rank, 'j')])}$, dim=dim, mask=mask${select_subarray(rank, [(rank, 'j')])}$ ) + #:endif + end do + else + do j = 1, size(x,dim=1) + #:if rank == 2 + s${select_subarray(rank-1, [(1, 'j')])}$ = stdlib_sum_kahan( x${select_subarray(rank, [(1, 'j')])}$, mask=mask${select_subarray(rank, [(1, 'j')])}$ ) + #:else + s${select_subarray(rank-1, [(1, 'j')])}$ = stdlib_sum_kahan( x${select_subarray(rank, [(1, 'j')])}$, dim=${rank-1}$, mask=mask${select_subarray(rank, [(1, 'j')])}$ ) + #:endif + end do + end if + end if + +end function +#:endfor +#:endfor + end submodule stdlib_intrinsics_sum diff --git a/test/intrinsics/test_intrinsics.fypp b/test/intrinsics/test_intrinsics.fypp index a8a14b701..7ef7eddc4 100644 --- a/test/intrinsics/test_intrinsics.fypp +++ b/test/intrinsics/test_intrinsics.fypp @@ -127,11 +127,18 @@ subroutine test_sum(error) call check(error, abs( sum(x) - stdlib_sum(x) ) sum over specific rank dim do i = 1, rank(x) call check(error, norm2( sum(x,dim=i) - stdlib_sum(x,dim=i) )