From 08ec0aa208ed11e4d36d0a1205e8a6594f1dd9c3 Mon Sep 17 00:00:00 2001
From: jalvesz <jose.alves@transvalor.com>
Date: Tue, 24 Dec 2024 13:12:50 +0100
Subject: [PATCH 01/18] intrinsics module with fast sums

---
 src/stdlib_intrinsics.fypp | 159 +++++++++++++++++++++++++++++++++++++
 1 file changed, 159 insertions(+)
 create mode 100644 src/stdlib_intrinsics.fypp

diff --git a/src/stdlib_intrinsics.fypp b/src/stdlib_intrinsics.fypp
new file mode 100644
index 000000000..086c58491
--- /dev/null
+++ b/src/stdlib_intrinsics.fypp
@@ -0,0 +1,159 @@
+#:include "common.fypp"
+#:set R_KINDS_TYPES = list(zip(REAL_KINDS, REAL_TYPES, REAL_SUFFIX))
+#:set C_KINDS_TYPES = list(zip(CMPLX_KINDS, CMPLX_TYPES, CMPLX_SUFFIX))
+#:set RANKS = range(2, 2)
+module stdlib_intrinsics
+    use stdlib_kinds
+    implicit none
+    private
+
+    interface fsum 
+        #:for rk, rt in REAL_KINDS_TYPES
+        module procedure :: fsum_1d_${rk}$
+        module procedure :: fsum_1d_${rk}$_mask
+        #:endfor
+    end interface
+    public :: fsum
+
+    interface fsum_kahan
+        #:for rk, rt in REAL_KINDS_TYPES
+        module procedure :: fsum_kahan_1d_${rk}$
+        module procedure :: fsum_kahan_1d_${rk}$_mask
+        #:endfor
+    end interface
+    public :: fsum_kahan
+
+    interface vkahan 
+        #:for rk, rt in REAL_KINDS_TYPES
+        module procedure :: vkahan_${rk}$
+        module procedure :: vkahan_m_${rk}$
+        #:endfor
+    end interface
+
+    #:for k1, t1, s1 in (R_KINDS_TYPES)
+    ${t1}$, parameter :: zero_${s1}$ = 0._${k1}$
+    #:endfor
+    #:for k1, t1, s1 in (C_KINDS_TYPES)
+    ${t1}$, parameter :: zero_${s1}$ = (0._${k1}$,0._${k1}$)
+    #:endfor
+    integer, parameter :: chunk = 64
+    
+contains
+
+#:for k1, t1, s1 in R_KINDS_TYPES
+pure function fsum_1d_${s1}$(a) result(s)
+    ${t1}$, intent(in) :: a(:)
+    ${t1}$ :: s
+    ${t1}$ :: abatch(chunk)
+    integer :: i, dr, rr
+    ! -----------------------------
+    dr = size(a)/chunk
+    rr = size(a) - dr*chunk
+
+    abatch = zero_${s1}$
+    do i = 1, dr
+      abatch(1:chunk) = abatch(1:chunk) + a(chunk*i-chunk+1:chunk*i)
+    end do
+    abatch(1:rr) = abatch(1:rr) + a(size(a)-rr+1:size(a))
+
+    s = zero_${s1}$
+    do i = 1, chunk/2
+      s = s + abatch(i)+abatch(chunk/2+i)
+    end do
+end function
+
+pure function fsum_1d_${s1}$_mask(a,mask) result(s)
+    ${t1}$, intent(in) :: a(:)
+    logical, intent(in) :: mask(:)
+    ${t1}$ :: s
+    ${t1}$ :: abatch(chunk)
+    integer :: i, dr, rr
+    ! -----------------------------
+    dr = size(a)/chunk
+    rr = size(a) - dr*chunk
+
+    abatch = zero_${s1}$
+    do i = 1, dr
+      abatch(1:chunk) = abatch(1:chunk) + merge( zero_${s1}$ , a(chunk*i-chunk+1:chunk*i) , mask(chunk*i-chunk+1:chunk*i) )
+    end do
+    abatch(1:rr) = abatch(1:rr) + merge( zero_${s1}$ , a(size(a)-rr+1:size(a)) , mask(size(a)-rr+1:size(a)) )
+    
+    s = zero_${s1}$
+    do i = 1, chunk/2
+        s = s + abatch(i)+abatch(chunk/2+i)
+    end do
+end function
+#:endfor
+
+#:for k1, t1, s1 in R_KINDS_TYPES
+pure function fsum_kahan_1d_${s1}$(a) result(s)
+    ${t1}$, intent(in) :: a(:)
+    ${t1}$ :: s
+    ${t1}$ :: sbatch(chunk)
+    ${t1}$ :: cbatch(chunk)
+    integer :: i, dr, rr
+    ! -----------------------------
+    dr = size(a)/(chunk)
+    rr = size(a) - dr*chunk     
+    sbatch = zero_${s1}$
+    cbatch = zero_${s1}$
+    do i = 1, dr
+        call vkahan( a(chunk*i-chunk+1:chunk*i) , sbatch(1:chunk) , cbatch(1:chunk) )
+    end do
+    call vkahan( a(size(a)-rr+1:size(a)) , sbatch(1:rr) , cbatch(1:rr) )      
+
+    s = zero_${s1}$
+    do i = 1,chunk
+        call vkahan( sbatch(i) , s , cbatch(i) )
+    end do
+end function
+
+pure function fsum_kahan_1d_${s1}$_mask(a,mask) result(s)
+    ${t1}$, intent(in) :: a(:)
+    logical, intent(in) :: mask(:)
+    ${t1}$ :: s
+    ${t1}$ :: sbatch(chunk)
+    ${t1}$ :: cbatch(chunk)
+    integer :: i, dr, rr
+    ! -----------------------------
+    dr = size(a)/(chunk)
+    rr = size(a) - dr*chunk     
+    sbatch = zero_${s1}$
+    cbatch = zero_${s1}$
+    do i = 1, dr
+        call vkahan( a(chunk*i-chunk+1:chunk*i) , sbatch(1:chunk) , cbatch(1:chunk) , mask(chunk*i-chunk+1:chunk*i) )
+    end do
+    call vkahan( a(size(a)-rr+1:size(a)) , sbatch(1:rr) , cbatch(1:rr) , mask(size(a)-rr+1:size(a)) )
+
+    s = zero_${s1}$
+    do i = 1,chunk
+        call vkahan( sbatch(i) , s , cbatch(i) )
+    end do
+end function
+#:endfor
+
+#:for k1, t1, s1 in R_KINDS_TYPES
+elemental subroutine vkahan_${s1}$(a,s,c)
+    ${t1}$, intent(in) :: a
+    ${t1}$, intent(inout) :: s
+    ${t1}$, intent(inout) :: c
+    ${t1}$ :: t, y
+    y = a - c
+    t = s + y
+    c = (t - s) - y
+    s = t
+end subroutine  
+elemental subroutine vkahan_m_${s1}$(a,s,c,m)
+    ${t1}$, intent(in) :: a
+    ${t1}$, intent(inout) :: s
+    ${t1}$, intent(inout) :: c
+    logical, intent(in) :: m
+    ${t1}$ :: t, y
+    y = a - c
+    t = s + y
+    c = (t - s) - y
+    s = merge( s , t , m )
+end subroutine 
+#:endfor
+
+end module stdlib_intrinsics

From 2bc7af92e1923095be5909c64858d0f99a60755f Mon Sep 17 00:00:00 2001
From: jalvesz <jose.alves@transvalor.com>
Date: Thu, 26 Dec 2024 21:04:10 +0100
Subject: [PATCH 02/18] add fast dot_product and start tests

---
 src/CMakeLists.txt                   |   1 +
 src/stdlib_intrinsics.fypp           | 100 +++++++++++++++++++++++----
 test/CMakeLists.txt                  |   1 +
 test/intrinsics/CMakeLists.txt       |   7 ++
 test/intrinsics/test_intrinsics.fypp |  94 +++++++++++++++++++++++++
 5 files changed, 190 insertions(+), 13 deletions(-)
 create mode 100644 test/intrinsics/CMakeLists.txt
 create mode 100644 test/intrinsics/test_intrinsics.fypp

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index ff9f39417..5f20d9f2a 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -16,6 +16,7 @@ set(fppFiles
     stdlib_hash_64bit_fnv.fypp
     stdlib_hash_64bit_pengy.fypp
     stdlib_hash_64bit_spookyv2.fypp
+    stdlib_intrinsics.fypp
     stdlib_io.fypp
     stdlib_io_npy.fypp
     stdlib_io_npy_load.fypp
diff --git a/src/stdlib_intrinsics.fypp b/src/stdlib_intrinsics.fypp
index 086c58491..dcbca0e70 100644
--- a/src/stdlib_intrinsics.fypp
+++ b/src/stdlib_intrinsics.fypp
@@ -1,32 +1,58 @@
 #:include "common.fypp"
 #:set R_KINDS_TYPES = list(zip(REAL_KINDS, REAL_TYPES, REAL_SUFFIX))
 #:set C_KINDS_TYPES = list(zip(CMPLX_KINDS, CMPLX_TYPES, CMPLX_SUFFIX))
-#:set RANKS = range(2, 2)
+#:set RC_KINDS_TYPES = R_KINDS_TYPES + C_KINDS_TYPES
+
+#:def cnjg(type,expression)
+#:if 'complex' in type
+conjg(${expression}$)
+#:else
+${expression}$
+#:endif
+#:enddef
+
+! This module is based on https://github.com/jalvesz/fast_math
 module stdlib_intrinsics
+    !!Replacement for certain Fortran intrinsic functions offering either faster and/or more accurate implementations.
+    !! ([Specification](../page/specs/stdlib_intrinsics.html))
     use stdlib_kinds
     implicit none
     private
 
     interface fsum 
-        #:for rk, rt in REAL_KINDS_TYPES
-        module procedure :: fsum_1d_${rk}$
-        module procedure :: fsum_1d_${rk}$_mask
+        #:for rk, rt, rs in RC_KINDS_TYPES
+        module procedure :: fsum_1d_${rs}$
+        module procedure :: fsum_1d_${rs}$_mask
         #:endfor
     end interface
     public :: fsum
 
     interface fsum_kahan
-        #:for rk, rt in REAL_KINDS_TYPES
-        module procedure :: fsum_kahan_1d_${rk}$
-        module procedure :: fsum_kahan_1d_${rk}$_mask
+        #:for rk, rt, rs in RC_KINDS_TYPES
+        module procedure :: fsum_kahan_1d_${rs}$
+        module procedure :: fsum_kahan_1d_${rs}$_mask
         #:endfor
     end interface
     public :: fsum_kahan
 
+    interface fprod
+        #:for rk, rt, rs in RC_KINDS_TYPES
+        module procedure :: fprod_${rs}$
+        #:endfor
+    end interface
+    public :: fprod
+
+    interface fprod_kahan
+        #:for rk, rt, rs in RC_KINDS_TYPES
+        module procedure :: fprod_kahan_${rs}$
+        #:endfor
+    end interface
+    public :: fprod_kahan
+
     interface vkahan 
-        #:for rk, rt in REAL_KINDS_TYPES
-        module procedure :: vkahan_${rk}$
-        module procedure :: vkahan_m_${rk}$
+        #:for rk, rt, rs in RC_KINDS_TYPES
+        module procedure :: vkahan_${rs}$
+        module procedure :: vkahan_m_${rs}$
         #:endfor
     end interface
 
@@ -40,7 +66,7 @@ module stdlib_intrinsics
     
 contains
 
-#:for k1, t1, s1 in R_KINDS_TYPES
+#:for k1, t1, s1 in RC_KINDS_TYPES
 pure function fsum_1d_${s1}$(a) result(s)
     ${t1}$, intent(in) :: a(:)
     ${t1}$ :: s
@@ -85,7 +111,7 @@ pure function fsum_1d_${s1}$_mask(a,mask) result(s)
 end function
 #:endfor
 
-#:for k1, t1, s1 in R_KINDS_TYPES
+#:for k1, t1, s1 in RC_KINDS_TYPES
 pure function fsum_kahan_1d_${s1}$(a) result(s)
     ${t1}$, intent(in) :: a(:)
     ${t1}$ :: s
@@ -132,7 +158,55 @@ pure function fsum_kahan_1d_${s1}$_mask(a,mask) result(s)
 end function
 #:endfor
 
-#:for k1, t1, s1 in R_KINDS_TYPES
+#:for k1, t1, s1 in RC_KINDS_TYPES
+pure function fprod_${s1}$(a,b) result(p)
+    ${t1}$, intent(in) :: a(:)
+    ${t1}$, intent(in) :: b(:)
+    ${t1}$ :: p
+    ${t1}$ :: abatch(chunk)
+    integer :: i, dr, rr
+    ! -----------------------------
+    dr = size(a)/chunk
+    rr = size(a) - dr*chunk
+
+    abatch = zero_${s1}$
+    do i = 1, dr
+      abatch(1:chunk) = abatch(1:chunk) + a(chunk*i-chunk+1:chunk*i)*${cnjg(t1,'b(chunk*i-chunk+1:chunk*i)')}$
+    end do
+    abatch(1:rr) = abatch(1:rr) + a(size(a)-rr+1:size(a))*${cnjg(t1,'b(size(a)-rr+1:size(a))')}$
+
+    p = zero_${s1}$
+    do i = 1, chunk/2
+      p = p + abatch(i)+abatch(chunk/2+i)
+    end do
+end function
+
+pure function fprod_kahan_${s1}$(a,b) result(p)
+    ${t1}$, intent(in) :: a(:)
+    ${t1}$, intent(in) :: b(:)
+    ${t1}$ :: p
+    ${t1}$ :: pbatch(chunk)
+    ${t1}$ :: cbatch(chunk)
+    integer :: i, dr, rr
+    ! -----------------------------
+    dr = size(a)/(chunk)
+    rr = size(a) - dr*chunk     
+    pbatch = zero_${s1}$
+    cbatch = zero_${s1}$
+    do i = 1, dr
+        call vkahan( a(chunk*i-chunk+1:chunk*i)*${cnjg(t1,'b(chunk*i-chunk+1:chunk*i)')}$ , pbatch(1:chunk) , cbatch(1:chunk) )
+    end do
+    call vkahan( a(size(a)-rr+1:size(a))*${cnjg(t1,'b(size(a)-rr+1:size(a))')}$ , pbatch(1:rr) , cbatch(1:rr) )      
+
+    p = zero_${s1}$
+    do i = 1,chunk
+        call vkahan( pbatch(i) , p , cbatch(i) )
+    end do
+end function
+
+#:endfor
+
+#:for k1, t1, s1 in RC_KINDS_TYPES
 elemental subroutine vkahan_${s1}$(a,s,c)
     ${t1}$, intent(in) :: a
     ${t1}$, intent(inout) :: s
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 4d83548db..566c0e6d8 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -17,6 +17,7 @@ add_subdirectory(constants)
 add_subdirectory(hash_functions)
 add_subdirectory(hash_functions_perf)
 add_subdirectory(hashmaps)
+add_subdirectory(intrinsics)
 add_subdirectory(io)
 add_subdirectory(linalg)
 add_subdirectory(logger)
diff --git a/test/intrinsics/CMakeLists.txt b/test/intrinsics/CMakeLists.txt
new file mode 100644
index 000000000..cc5e9fb87
--- /dev/null
+++ b/test/intrinsics/CMakeLists.txt
@@ -0,0 +1,7 @@
+set(
+  fppFiles
+  "test_intrinsics.fypp"
+)
+fypp_f90("${fyppFlags}" "${fppFiles}" outFiles)
+
+ADDTEST(intrinsics)
diff --git a/test/intrinsics/test_intrinsics.fypp b/test/intrinsics/test_intrinsics.fypp
new file mode 100644
index 000000000..d7f579b52
--- /dev/null
+++ b/test/intrinsics/test_intrinsics.fypp
@@ -0,0 +1,94 @@
+#:include "common.fypp"
+#:set R_KINDS_TYPES = list(zip(REAL_KINDS, REAL_TYPES, REAL_SUFFIX))
+#:set C_KINDS_TYPES = list(zip(CMPLX_KINDS, CMPLX_TYPES, CMPLX_SUFFIX))
+#:set RC_KINDS_TYPES = R_KINDS_TYPES + C_KINDS_TYPES
+
+module test_intrinsics
+    use testdrive, only : new_unittest, unittest_type, error_type, check, skip_test
+    use stdlib_kinds, only: sp, dp, xdp, qp, int8, int16, int32, int64
+    use stdlib_intrinsics
+    use stdlib_math, only: swap
+    implicit none
+    
+contains
+
+!> Collect all exported unit tests
+subroutine collect_suite(testsuite)
+    !> Collection of tests
+    type(unittest_type), allocatable, intent(out) :: testsuite(:)
+
+    testsuite = [ &
+        new_unittest('sum', test_sum) &
+        !new_unittest('dot_product', test_dot_product) &
+    ]
+end subroutine
+
+subroutine test_sum(error)
+    !> Error handling
+    type(error_type), allocatable, intent(out) :: error
+
+    !> Internal parameters and variables
+    integer, parameter :: n = 1e3, ncalc = 3, niter = 100
+    real(sp) :: u
+    integer :: iter, i, j
+    !====================================================================================
+    #:for k1, t1, s1 in R_KINDS_TYPES
+    block
+        ${t1}$, allocatable :: x(:)
+        ${t1}$, parameter :: total_sum = 4*atan(1._${k1}$), tolerance = epsilon(1._${k1}$)*100
+        ${t1}$ :: xsum(ncalc), meanval(ncalc), err(ncalc)
+
+        allocate(x(n))
+        do i = 1, n 
+            x(i) = 8*atan(1._${k1}$)*(real(i,kind=${k1}$)-0.5_${k1}$)/real(n,kind=${k1}$)**2
+        end do
+        ! scramble array
+        do i = 1, n
+            call random_number(u) 
+            j = 1 + FLOOR(n*u)
+            call swap( x(i), x(j) )
+        end do
+        
+        err(:) = 0._${k1}$
+        do iter = 1, niter
+            xsum(1) = sum(x)
+            xsum(2) = fsum_kahan(x)
+            xsum(3) = fsum(x) 
+            err(1:ncalc) = err(1:ncalc) + abs(1._${k1}$-xsum(1:ncalc)/total_sum)
+        end do
+        err(1:ncalc) = err(1:ncalc) / niter 
+        print *, err
+        call check(error, all(err(:)<tolerance) )
+        if (allocated(error)) return
+    end block
+    #:endfor
+
+end subroutine
+    
+end module test_intrinsics
+
+program tester
+    use, intrinsic :: iso_fortran_env, only : error_unit
+    use testdrive, only : run_testsuite, new_testsuite, testsuite_type
+    use test_intrinsics, only : collect_suite
+    implicit none
+    integer :: stat, is
+    type(testsuite_type), allocatable :: testsuites(:)
+    character(len=*), parameter :: fmt = '("#", *(1x, a))'
+
+    stat = 0
+
+    testsuites = [ &
+        new_testsuite("sparse", collect_suite) &
+        ]
+
+    do is = 1, size(testsuites)
+        write(error_unit, fmt) "Testing:", testsuites(is)%name
+        call run_testsuite(testsuites(is)%collect, error_unit, stat)
+    end do
+
+    if (stat > 0) then
+        write(error_unit, '(i0, 1x, a)') stat, "test(s) failed!"
+        error stop
+    end if
+end program
\ No newline at end of file

From 243ea6fd68f3a643d7c6cd14f0008e87fd370242 Mon Sep 17 00:00:00 2001
From: jalvesz <jose.alves@transvalor.com>
Date: Thu, 26 Dec 2024 23:20:38 +0100
Subject: [PATCH 03/18] add complex sum test

---
 test/intrinsics/test_intrinsics.fypp | 46 ++++++++++++++++++++++++----
 1 file changed, 40 insertions(+), 6 deletions(-)

diff --git a/test/intrinsics/test_intrinsics.fypp b/test/intrinsics/test_intrinsics.fypp
index d7f579b52..accc3a822 100644
--- a/test/intrinsics/test_intrinsics.fypp
+++ b/test/intrinsics/test_intrinsics.fypp
@@ -45,20 +45,54 @@ subroutine test_sum(error)
         ! scramble array
         do i = 1, n
             call random_number(u) 
-            j = 1 + FLOOR(n*u)
+            j = 1 + floor(n*u)
             call swap( x(i), x(j) )
         end do
         
         err(:) = 0._${k1}$
         do iter = 1, niter
-            xsum(1) = sum(x)
-            xsum(2) = fsum_kahan(x)
-            xsum(3) = fsum(x) 
+            xsum(1) = sum(x)        ! compiler intrinsic
+            xsum(2) = fsum_kahan(x) ! chunked Kahan summation
+            xsum(3) = fsum(x)       ! chunked summation
             err(1:ncalc) = err(1:ncalc) + abs(1._${k1}$-xsum(1:ncalc)/total_sum)
         end do
         err(1:ncalc) = err(1:ncalc) / niter 
-        print *, err
-        call check(error, all(err(:)<tolerance) )
+        
+        call check(error, all(err(:)<tolerance) , "real sum is not accurate" )
+        if (allocated(error)) return
+    end block
+    #:endfor
+
+    #:for k1, t1, s1 in C_KINDS_TYPES
+    block
+        ${t1}$, allocatable :: x(:)
+        real(${k1}$), parameter :: total_sum = 4*atan(1._${k1}$), tolerance = epsilon(1._${k1}$)*100
+        real(${k1}$) :: err(ncalc)
+        ${t1}$ :: xsum(ncalc), meanval(ncalc)
+
+        allocate(x(n))
+        do i = 1, n
+            x(i) = complex(&
+                8*atan(1._${k1}$)*(real(i,kind=${k1}$)-0.5_${k1}$)/real(2*n,kind=${k1}$)**2,&
+                8*atan(1._${k1}$)*(real(i+n,kind=${k1}$)-0.5_${k1}$)/real(2*n,kind=${k1}$)**2)
+        end do
+        ! scramble array
+        do i = 1, n
+            call random_number(u) 
+            j = 1 + floor(n*u)
+            call swap( x(i), x(j) )
+        end do
+        
+        err(:) = 0._${k1}$
+        do iter = 1, niter
+            xsum(1) = sum(x)        ! compiler intrinsic
+            xsum(2) = fsum_kahan(x) ! chunked Kahan summation
+            xsum(3) = fsum(x)       ! chunked summation
+            err(1:ncalc) = err(1:ncalc) + abs(1._${k1}$-(xsum(1:ncalc)%re+xsum(1:ncalc)%im)/total_sum)
+        end do
+        err(1:ncalc) = err(1:ncalc) / niter 
+        
+        call check(error, all(err(:)<tolerance) , "complex sum is not accurate" )
         if (allocated(error)) return
     end block
     #:endfor

From c38dcd603873d276498d8c4ea27707ed9633c66e Mon Sep 17 00:00:00 2001
From: jalvesz <jose.alves@transvalor.com>
Date: Thu, 26 Dec 2024 23:36:05 +0100
Subject: [PATCH 04/18] test masked sum

---
 test/intrinsics/test_intrinsics.fypp | 34 ++++++++++++++++++++++++++++
 1 file changed, 34 insertions(+)

diff --git a/test/intrinsics/test_intrinsics.fypp b/test/intrinsics/test_intrinsics.fypp
index accc3a822..790250a68 100644
--- a/test/intrinsics/test_intrinsics.fypp
+++ b/test/intrinsics/test_intrinsics.fypp
@@ -37,16 +37,21 @@ subroutine test_sum(error)
         ${t1}$, allocatable :: x(:)
         ${t1}$, parameter :: total_sum = 4*atan(1._${k1}$), tolerance = epsilon(1._${k1}$)*100
         ${t1}$ :: xsum(ncalc), meanval(ncalc), err(ncalc)
+        logical, allocatable :: mask(:), nmask(:)
 
         allocate(x(n))
         do i = 1, n 
             x(i) = 8*atan(1._${k1}$)*(real(i,kind=${k1}$)-0.5_${k1}$)/real(n,kind=${k1}$)**2
         end do
+        allocate(mask(n),source=.false.); mask(1:n:2) = .true.
+        allocate(nmask(n)); nmask = .not.mask
         ! scramble array
         do i = 1, n
             call random_number(u) 
             j = 1 + floor(n*u)
             call swap( x(i), x(j) )
+            call swap( mask(i), mask(j) )
+            call swap( nmask(i), nmask(j) )
         end do
         
         err(:) = 0._${k1}$
@@ -60,6 +65,18 @@ subroutine test_sum(error)
         
         call check(error, all(err(:)<tolerance) , "real sum is not accurate" )
         if (allocated(error)) return
+
+        err(:) = 0._${k1}$
+        do iter = 1, niter
+            xsum(1) = sum(x,mask)+sum(x,nmask) ! compiler intrinsic
+            xsum(2) = fsum_kahan(x,mask)+fsum_kahan(x,nmask) ! chunked Kahan summation
+            xsum(3) = fsum(x,mask)+fsum(x,nmask) ! chunked summation
+            err(1:ncalc) = err(1:ncalc) + abs(1._${k1}$-xsum(1:ncalc)/total_sum)
+        end do
+        err(1:ncalc) = err(1:ncalc) / niter 
+        
+        call check(error, all(err(:)<tolerance) , "masked real sum is not accurate" )
+        if (allocated(error)) return
     end block
     #:endfor
 
@@ -69,6 +86,7 @@ subroutine test_sum(error)
         real(${k1}$), parameter :: total_sum = 4*atan(1._${k1}$), tolerance = epsilon(1._${k1}$)*100
         real(${k1}$) :: err(ncalc)
         ${t1}$ :: xsum(ncalc), meanval(ncalc)
+        logical, allocatable :: mask(:), nmask(:)
 
         allocate(x(n))
         do i = 1, n
@@ -76,11 +94,15 @@ subroutine test_sum(error)
                 8*atan(1._${k1}$)*(real(i,kind=${k1}$)-0.5_${k1}$)/real(2*n,kind=${k1}$)**2,&
                 8*atan(1._${k1}$)*(real(i+n,kind=${k1}$)-0.5_${k1}$)/real(2*n,kind=${k1}$)**2)
         end do
+        allocate(mask(n),source=.false.); mask(1:n:2) = .true.
+        allocate(nmask(n)); nmask = .not.mask
         ! scramble array
         do i = 1, n
             call random_number(u) 
             j = 1 + floor(n*u)
             call swap( x(i), x(j) )
+            call swap( mask(i), mask(j) )
+            call swap( nmask(i), nmask(j) )
         end do
         
         err(:) = 0._${k1}$
@@ -94,6 +116,18 @@ subroutine test_sum(error)
         
         call check(error, all(err(:)<tolerance) , "complex sum is not accurate" )
         if (allocated(error)) return
+
+        err(:) = 0._${k1}$
+        do iter = 1, niter
+            xsum(1) = sum(x,mask)+sum(x,nmask) ! compiler intrinsic
+            xsum(2) = fsum_kahan(x,mask)+fsum_kahan(x,nmask) ! chunked Kahan summation
+            xsum(3) = fsum(x,mask)+fsum(x,nmask) ! chunked summation
+            err(1:ncalc) = err(1:ncalc) + abs(1._${k1}$-(xsum(1:ncalc)%re+xsum(1:ncalc)%im)/total_sum)
+        end do
+        err(1:ncalc) = err(1:ncalc) / niter 
+        
+        call check(error, all(err(:)<tolerance) , "complex masked sum is not accurate" )
+        if (allocated(error)) return
     end block
     #:endfor
 

From bf1ce2f3818bbe6003d6d74f89c9af1f9dfabdd1 Mon Sep 17 00:00:00 2001
From: jalvesz <jose.alves@transvalor.com>
Date: Thu, 26 Dec 2024 23:49:06 +0100
Subject: [PATCH 05/18] add dot_product tests

---
 test/intrinsics/test_intrinsics.fypp | 80 +++++++++++++++++++++++++++-
 1 file changed, 78 insertions(+), 2 deletions(-)

diff --git a/test/intrinsics/test_intrinsics.fypp b/test/intrinsics/test_intrinsics.fypp
index 790250a68..f83f60759 100644
--- a/test/intrinsics/test_intrinsics.fypp
+++ b/test/intrinsics/test_intrinsics.fypp
@@ -18,8 +18,8 @@ subroutine collect_suite(testsuite)
     type(unittest_type), allocatable, intent(out) :: testsuite(:)
 
     testsuite = [ &
-        new_unittest('sum', test_sum) &
-        !new_unittest('dot_product', test_dot_product) &
+        new_unittest('sum', test_sum), &
+        new_unittest('dot_product', test_dot_product) &
     ]
 end subroutine
 
@@ -132,6 +132,82 @@ subroutine test_sum(error)
     #:endfor
 
 end subroutine
+
+subroutine test_dot_product(error)
+    !> Error handling
+    type(error_type), allocatable, intent(out) :: error
+
+    !> Internal parameters and variables
+    integer, parameter :: n = 1e3, ncalc = 3, niter = 100
+    real(sp) :: u
+    integer :: iter, i, j
+    !====================================================================================
+    #:for k1, t1, s1 in R_KINDS_TYPES
+    block
+        ${t1}$, allocatable :: x(:)
+        ${t1}$, parameter :: total_sum = 4*atan(1._${k1}$), tolerance = epsilon(1._${k1}$)*100
+        ${t1}$ :: xsum(ncalc), meanval(ncalc), err(ncalc)
+
+        allocate(x(n))
+        do i = 1, n 
+            x(i) = sqrt( 8*atan(1._${k1}$)*(real(i,kind=${k1}$)-0.5_${k1}$)/real(n,kind=${k1}$)**2 )
+        end do
+        ! scramble array
+        do i = 1, n
+            call random_number(u) 
+            j = 1 + floor(n*u)
+            call swap( x(i), x(j) )
+        end do
+        
+        err(:) = 0._${k1}$
+        do iter = 1, niter
+            xsum(1) = dot_product(x,x) ! compiler intrinsic
+            xsum(2) = fprod_kahan(x,x) ! chunked Kahan summation
+            xsum(3) = fprod(x,x)       ! chunked summation
+            err(1:ncalc) = err(1:ncalc) + abs(1._${k1}$-xsum(1:ncalc)/total_sum)
+        end do
+        err(1:ncalc) = err(1:ncalc) / niter 
+        
+        call check(error, all(err(:)<tolerance) , "real dot_product is not accurate" )
+        if (allocated(error)) return
+    end block
+    #:endfor
+
+    #:for k1, t1, s1 in C_KINDS_TYPES
+    block
+        ${t1}$, allocatable :: x(:)
+        real(${k1}$), parameter :: total_sum = 4*atan(1._${k1}$), tolerance = epsilon(1._${k1}$)*100
+        real(${k1}$) :: err(ncalc)
+        ${t1}$ :: xsum(ncalc), meanval(ncalc)
+
+        allocate(x(n))
+        do i = 1, n
+            x(i) = complex(&
+                sqrt(8*atan(1._${k1}$)*(real(i,kind=${k1}$)-0.5_${k1}$)/real(2*n,kind=${k1}$)**2),&
+                sqrt(8*atan(1._${k1}$)*(real(i+n,kind=${k1}$)-0.5_${k1}$)/real(2*n,kind=${k1}$)**2))
+        end do
+        ! scramble array
+        do i = 1, n
+            call random_number(u) 
+            j = 1 + floor(n*u)
+            call swap( x(i), x(j) )
+        end do
+        
+        err(:) = 0._${k1}$
+        do iter = 1, niter
+            xsum(1) = dot_product(x,x) ! compiler intrinsic
+            xsum(2) = fprod_kahan(x,x) ! chunked Kahan dot_product
+            xsum(3) = fprod(x,x)       ! chunked dot_product
+            err(1:ncalc) = err(1:ncalc) + abs(1._${k1}$-(xsum(1:ncalc)%re+xsum(1:ncalc)%im)/total_sum)
+        end do
+        err(1:ncalc) = err(1:ncalc) / niter 
+        
+        call check(error, all(err(:)<tolerance) , "complex dot_product is not accurate" )
+        if (allocated(error)) return
+    end block
+    #:endfor
+
+end subroutine
     
 end module test_intrinsics
 

From cc9df6164162fe723e95e54ca89f30c3dfa67faf Mon Sep 17 00:00:00 2001
From: jalvesz <jose.alves@transvalor.com>
Date: Sun, 29 Dec 2024 22:13:47 +0100
Subject: [PATCH 06/18] start specs

---
 doc/specs/stdlib_intrinsics.md | 45 ++++++++++++++++++++++++++++++++++
 1 file changed, 45 insertions(+)
 create mode 100644 doc/specs/stdlib_intrinsics.md

diff --git a/doc/specs/stdlib_intrinsics.md b/doc/specs/stdlib_intrinsics.md
new file mode 100644
index 000000000..956d47ffc
--- /dev/null
+++ b/doc/specs/stdlib_intrinsics.md
@@ -0,0 +1,45 @@
+---
+title: intrinsics
+---
+
+# The `stdlib_intrinsics` module
+
+[TOC]
+
+## Introduction
+
+The `stdlib_intrinsics` module provides replacements for some of the well known intrinsic functions found in Fortran compilers for which either a faster and/or more accurate implementation is found which has also proven of interest to the Fortran community.
+
+<!-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -->
+### `fsum` function
+
+#### Description
+
+The `fsum` function can replace the intrinsic `sum` for 1D `real` or `complex` arrays. It follows a chunked implementation which maximaxes vectorization potential as well as reducing the round-off error.
+
+#### Syntax
+
+`res = ` [[stdlib_intrinsics(module):fsum(interface)]] ` (x [,mask] )`
+
+#### Status
+
+Experimental
+
+#### Class
+
+Pure function.
+
+#### Argument(s)
+
+`x`: 1D array of either `real` or `complex` type. This argument is `intent(in)`.
+`mask`: 1D array of `logical` values. This argument is `intent(in)`.
+
+#### Output value or Result value
+
+The output is a scalar of `type` and `kind` same as to that of `x`.
+
+#### Example
+
+```fortran
+{!example/math/example_intrinsics_sum.f90!}
+```

From 75945f1234f1cba1d4a89ee5b870ad056bb536fc Mon Sep 17 00:00:00 2001
From: jalvesz <jose.alves@transvalor.com>
Date: Thu, 2 Jan 2025 22:15:14 +0100
Subject: [PATCH 07/18] split into submodules

---
 src/CMakeLists.txt                     |   2 +
 src/stdlib_constants.fypp              |  11 ++
 src/stdlib_intrinsics.fypp             | 213 +++++--------------------
 src/stdlib_intrinsics_dot_product.fypp |  75 +++++++++
 src/stdlib_intrinsics_sum.fypp         | 118 ++++++++++++++
 5 files changed, 246 insertions(+), 173 deletions(-)
 create mode 100644 src/stdlib_intrinsics_dot_product.fypp
 create mode 100644 src/stdlib_intrinsics_sum.fypp

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 9b52321f5..350db423a 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -16,6 +16,8 @@ set(fppFiles
     stdlib_hash_64bit_fnv.fypp
     stdlib_hash_64bit_pengy.fypp
     stdlib_hash_64bit_spookyv2.fypp
+    stdlib_intrinsics_dot_product.fypp
+    stdlib_intrinsics_sum.fypp
     stdlib_intrinsics.fypp
     stdlib_io.fypp
     stdlib_io_npy.fypp
diff --git a/src/stdlib_constants.fypp b/src/stdlib_constants.fypp
index e169dbb3b..61c7d6c90 100644
--- a/src/stdlib_constants.fypp
+++ b/src/stdlib_constants.fypp
@@ -1,5 +1,8 @@
 #:include "common.fypp"
 #:set KINDS = REAL_KINDS
+#:set R_KINDS_TYPES = list(zip(REAL_KINDS, REAL_TYPES, REAL_SUFFIX))
+#:set C_KINDS_TYPES = list(zip(CMPLX_KINDS, CMPLX_TYPES, CMPLX_SUFFIX))
+
 module stdlib_constants
     !! Constants
     !! ([Specification](../page/specs/stdlib_constants.html))
@@ -60,5 +63,13 @@ module stdlib_constants
     real(dp), parameter, public :: u = ATOMIC_MASS_CONSTANT%value !! Atomic mass constant
 
     ! Additional constants if needed
+    #:for k, t, s in R_KINDS_TYPES
+    ${t}$, parameter, public :: zero_${s}$ = 0._${k}$
+    ${t}$, parameter, public :: one_${s}$  = 1._${k}$
+    #:endfor
+    #:for k, t, s in C_KINDS_TYPES
+    ${t}$, parameter, public :: zero_${s}$ = (0._${k}$,0._${k}$)
+    ${t}$, parameter, public :: one_${s}$  = (1._${k}$,0._${k}$)
+    #:endfor
 
 end module stdlib_constants
diff --git a/src/stdlib_intrinsics.fypp b/src/stdlib_intrinsics.fypp
index dcbca0e70..0a4d3244a 100644
--- a/src/stdlib_intrinsics.fypp
+++ b/src/stdlib_intrinsics.fypp
@@ -3,14 +3,6 @@
 #:set C_KINDS_TYPES = list(zip(CMPLX_KINDS, CMPLX_TYPES, CMPLX_SUFFIX))
 #:set RC_KINDS_TYPES = R_KINDS_TYPES + C_KINDS_TYPES
 
-#:def cnjg(type,expression)
-#:if 'complex' in type
-conjg(${expression}$)
-#:else
-${expression}$
-#:endif
-#:enddef
-
 ! This module is based on https://github.com/jalvesz/fast_math
 module stdlib_intrinsics
     !!Replacement for certain Fortran intrinsic functions offering either faster and/or more accurate implementations.
@@ -21,30 +13,52 @@ module stdlib_intrinsics
 
     interface fsum 
         #:for rk, rt, rs in RC_KINDS_TYPES
-        module procedure :: fsum_1d_${rs}$
-        module procedure :: fsum_1d_${rs}$_mask
+        pure module function fsum_1d_${rs}$(a) result(s)
+            ${rt}$, intent(in) :: a(:)
+            ${rt}$ :: s
+        end function
+        pure module function fsum_1d_${rs}$_mask(a,mask) result(s)
+            ${rt}$, intent(in) :: a(:)
+            logical, intent(in) :: mask(:)
+            ${rt}$ :: s
+        end function
         #:endfor
     end interface
     public :: fsum
 
     interface fsum_kahan
         #:for rk, rt, rs in RC_KINDS_TYPES
-        module procedure :: fsum_kahan_1d_${rs}$
-        module procedure :: fsum_kahan_1d_${rs}$_mask
+        pure module function fsum_kahan_1d_${rs}$(a) result(s)
+            ${rt}$, intent(in) :: a(:)
+            ${rt}$ :: s
+        end function
+        pure module function fsum_kahan_1d_${rs}$_mask(a,mask) result(s)
+            ${rt}$, intent(in) :: a(:)
+            logical, intent(in) :: mask(:)
+            ${rt}$ :: s
+        end function
         #:endfor
     end interface
     public :: fsum_kahan
 
     interface fprod
         #:for rk, rt, rs in RC_KINDS_TYPES
-        module procedure :: fprod_${rs}$
+        pure module function fprod_${rs}$(a,b) result(p)
+            ${rt}$, intent(in) :: a(:)
+            ${rt}$, intent(in) :: b(:)
+            ${rt}$ :: p
+        end function
         #:endfor
     end interface
     public :: fprod
 
     interface fprod_kahan
         #:for rk, rt, rs in RC_KINDS_TYPES
-        module procedure :: fprod_kahan_${rs}$
+        pure module function fprod_kahan_${rs}$(a,b) result(p)
+            ${rt}$, intent(in) :: a(:)
+            ${rt}$, intent(in) :: b(:)
+            ${rt}$ :: p
+        end function
         #:endfor
     end interface
     public :: fprod_kahan
@@ -55,174 +69,27 @@ module stdlib_intrinsics
         module procedure :: vkahan_m_${rs}$
         #:endfor
     end interface
-
-    #:for k1, t1, s1 in (R_KINDS_TYPES)
-    ${t1}$, parameter :: zero_${s1}$ = 0._${k1}$
-    #:endfor
-    #:for k1, t1, s1 in (C_KINDS_TYPES)
-    ${t1}$, parameter :: zero_${s1}$ = (0._${k1}$,0._${k1}$)
-    #:endfor
-    integer, parameter :: chunk = 64
+    public :: vkahan
     
 contains
 
-#:for k1, t1, s1 in RC_KINDS_TYPES
-pure function fsum_1d_${s1}$(a) result(s)
-    ${t1}$, intent(in) :: a(:)
-    ${t1}$ :: s
-    ${t1}$ :: abatch(chunk)
-    integer :: i, dr, rr
-    ! -----------------------------
-    dr = size(a)/chunk
-    rr = size(a) - dr*chunk
-
-    abatch = zero_${s1}$
-    do i = 1, dr
-      abatch(1:chunk) = abatch(1:chunk) + a(chunk*i-chunk+1:chunk*i)
-    end do
-    abatch(1:rr) = abatch(1:rr) + a(size(a)-rr+1:size(a))
-
-    s = zero_${s1}$
-    do i = 1, chunk/2
-      s = s + abatch(i)+abatch(chunk/2+i)
-    end do
-end function
-
-pure function fsum_1d_${s1}$_mask(a,mask) result(s)
-    ${t1}$, intent(in) :: a(:)
-    logical, intent(in) :: mask(:)
-    ${t1}$ :: s
-    ${t1}$ :: abatch(chunk)
-    integer :: i, dr, rr
-    ! -----------------------------
-    dr = size(a)/chunk
-    rr = size(a) - dr*chunk
-
-    abatch = zero_${s1}$
-    do i = 1, dr
-      abatch(1:chunk) = abatch(1:chunk) + merge( zero_${s1}$ , a(chunk*i-chunk+1:chunk*i) , mask(chunk*i-chunk+1:chunk*i) )
-    end do
-    abatch(1:rr) = abatch(1:rr) + merge( zero_${s1}$ , a(size(a)-rr+1:size(a)) , mask(size(a)-rr+1:size(a)) )
-    
-    s = zero_${s1}$
-    do i = 1, chunk/2
-        s = s + abatch(i)+abatch(chunk/2+i)
-    end do
-end function
-#:endfor
-
-#:for k1, t1, s1 in RC_KINDS_TYPES
-pure function fsum_kahan_1d_${s1}$(a) result(s)
-    ${t1}$, intent(in) :: a(:)
-    ${t1}$ :: s
-    ${t1}$ :: sbatch(chunk)
-    ${t1}$ :: cbatch(chunk)
-    integer :: i, dr, rr
-    ! -----------------------------
-    dr = size(a)/(chunk)
-    rr = size(a) - dr*chunk     
-    sbatch = zero_${s1}$
-    cbatch = zero_${s1}$
-    do i = 1, dr
-        call vkahan( a(chunk*i-chunk+1:chunk*i) , sbatch(1:chunk) , cbatch(1:chunk) )
-    end do
-    call vkahan( a(size(a)-rr+1:size(a)) , sbatch(1:rr) , cbatch(1:rr) )      
-
-    s = zero_${s1}$
-    do i = 1,chunk
-        call vkahan( sbatch(i) , s , cbatch(i) )
-    end do
-end function
-
-pure function fsum_kahan_1d_${s1}$_mask(a,mask) result(s)
-    ${t1}$, intent(in) :: a(:)
-    logical, intent(in) :: mask(:)
-    ${t1}$ :: s
-    ${t1}$ :: sbatch(chunk)
-    ${t1}$ :: cbatch(chunk)
-    integer :: i, dr, rr
-    ! -----------------------------
-    dr = size(a)/(chunk)
-    rr = size(a) - dr*chunk     
-    sbatch = zero_${s1}$
-    cbatch = zero_${s1}$
-    do i = 1, dr
-        call vkahan( a(chunk*i-chunk+1:chunk*i) , sbatch(1:chunk) , cbatch(1:chunk) , mask(chunk*i-chunk+1:chunk*i) )
-    end do
-    call vkahan( a(size(a)-rr+1:size(a)) , sbatch(1:rr) , cbatch(1:rr) , mask(size(a)-rr+1:size(a)) )
-
-    s = zero_${s1}$
-    do i = 1,chunk
-        call vkahan( sbatch(i) , s , cbatch(i) )
-    end do
-end function
-#:endfor
-
-#:for k1, t1, s1 in RC_KINDS_TYPES
-pure function fprod_${s1}$(a,b) result(p)
-    ${t1}$, intent(in) :: a(:)
-    ${t1}$, intent(in) :: b(:)
-    ${t1}$ :: p
-    ${t1}$ :: abatch(chunk)
-    integer :: i, dr, rr
-    ! -----------------------------
-    dr = size(a)/chunk
-    rr = size(a) - dr*chunk
-
-    abatch = zero_${s1}$
-    do i = 1, dr
-      abatch(1:chunk) = abatch(1:chunk) + a(chunk*i-chunk+1:chunk*i)*${cnjg(t1,'b(chunk*i-chunk+1:chunk*i)')}$
-    end do
-    abatch(1:rr) = abatch(1:rr) + a(size(a)-rr+1:size(a))*${cnjg(t1,'b(size(a)-rr+1:size(a))')}$
-
-    p = zero_${s1}$
-    do i = 1, chunk/2
-      p = p + abatch(i)+abatch(chunk/2+i)
-    end do
-end function
-
-pure function fprod_kahan_${s1}$(a,b) result(p)
-    ${t1}$, intent(in) :: a(:)
-    ${t1}$, intent(in) :: b(:)
-    ${t1}$ :: p
-    ${t1}$ :: pbatch(chunk)
-    ${t1}$ :: cbatch(chunk)
-    integer :: i, dr, rr
-    ! -----------------------------
-    dr = size(a)/(chunk)
-    rr = size(a) - dr*chunk     
-    pbatch = zero_${s1}$
-    cbatch = zero_${s1}$
-    do i = 1, dr
-        call vkahan( a(chunk*i-chunk+1:chunk*i)*${cnjg(t1,'b(chunk*i-chunk+1:chunk*i)')}$ , pbatch(1:chunk) , cbatch(1:chunk) )
-    end do
-    call vkahan( a(size(a)-rr+1:size(a))*${cnjg(t1,'b(size(a)-rr+1:size(a))')}$ , pbatch(1:rr) , cbatch(1:rr) )      
-
-    p = zero_${s1}$
-    do i = 1,chunk
-        call vkahan( pbatch(i) , p , cbatch(i) )
-    end do
-end function
-
-#:endfor
-
-#:for k1, t1, s1 in RC_KINDS_TYPES
-elemental subroutine vkahan_${s1}$(a,s,c)
-    ${t1}$, intent(in) :: a
-    ${t1}$, intent(inout) :: s
-    ${t1}$, intent(inout) :: c
-    ${t1}$ :: t, y
+#:for rk, rt, rs in RC_KINDS_TYPES
+elemental subroutine vkahan_${rs}$(a,s,c)
+    ${rt}$, intent(in) :: a
+    ${rt}$, intent(inout) :: s
+    ${rt}$, intent(inout) :: c
+    ${rt}$ :: t, y
     y = a - c
     t = s + y
     c = (t - s) - y
     s = t
 end subroutine  
-elemental subroutine vkahan_m_${s1}$(a,s,c,m)
-    ${t1}$, intent(in) :: a
-    ${t1}$, intent(inout) :: s
-    ${t1}$, intent(inout) :: c
+elemental subroutine vkahan_m_${rs}$(a,s,c,m)
+    ${rt}$, intent(in) :: a
+    ${rt}$, intent(inout) :: s
+    ${rt}$, intent(inout) :: c
     logical, intent(in) :: m
-    ${t1}$ :: t, y
+    ${rt}$ :: t, y
     y = a - c
     t = s + y
     c = (t - s) - y
diff --git a/src/stdlib_intrinsics_dot_product.fypp b/src/stdlib_intrinsics_dot_product.fypp
new file mode 100644
index 000000000..045d512b2
--- /dev/null
+++ b/src/stdlib_intrinsics_dot_product.fypp
@@ -0,0 +1,75 @@
+#:include "common.fypp"
+#:set R_KINDS_TYPES = list(zip(REAL_KINDS, REAL_TYPES, REAL_SUFFIX))
+#:set C_KINDS_TYPES = list(zip(CMPLX_KINDS, CMPLX_TYPES, CMPLX_SUFFIX))
+#:set RC_KINDS_TYPES = R_KINDS_TYPES + C_KINDS_TYPES
+
+#:def cnjg(type,expression)
+#:if 'complex' in type
+conjg(${expression}$)
+#:else
+${expression}$
+#:endif
+#:enddef
+
+! This module is based on https://github.com/jalvesz/fast_math
+submodule(stdlib_intrinsics) stdlib_intrinsics_dot_product
+    !!Replacement for certain Fortran intrinsic functions offering either faster and/or more accurate implementations.
+    !! ([Specification](../page/specs/stdlib_intrinsics.html))
+    use stdlib_kinds
+    use stdlib_constants
+    implicit none
+
+    integer, parameter :: chunk = 64
+    
+contains
+
+#:for k1, t1, s1 in RC_KINDS_TYPES
+pure module function fprod_${s1}$(a,b) result(p)
+    ${t1}$, intent(in) :: a(:)
+    ${t1}$, intent(in) :: b(:)
+    ${t1}$ :: p
+    ${t1}$ :: abatch(chunk)
+    integer :: i, dr, rr
+    ! -----------------------------
+    dr = size(a)/chunk
+    rr = size(a) - dr*chunk
+
+    abatch = zero_${s1}$
+    do i = 1, dr
+        abatch(1:chunk) = abatch(1:chunk) + a(chunk*i-chunk+1:chunk*i)*${cnjg(t1,'b(chunk*i-chunk+1:chunk*i)')}$
+    end do
+    abatch(1:rr) = abatch(1:rr) + a(size(a)-rr+1:size(a))*${cnjg(t1,'b(size(a)-rr+1:size(a))')}$
+
+    p = zero_${s1}$
+    do i = 1, chunk/2
+        p = p + abatch(i)+abatch(chunk/2+i)
+    end do
+end function
+#:endfor
+
+#:for k1, t1, s1 in RC_KINDS_TYPES
+pure module function fprod_kahan_${s1}$(a,b) result(p)
+    ${t1}$, intent(in) :: a(:)
+    ${t1}$, intent(in) :: b(:)
+    ${t1}$ :: p
+    ${t1}$ :: pbatch(chunk)
+    ${t1}$ :: cbatch(chunk)
+    integer :: i, dr, rr
+    ! -----------------------------
+    dr = size(a)/(chunk)
+    rr = size(a) - dr*chunk     
+    pbatch = zero_${s1}$
+    cbatch = zero_${s1}$
+    do i = 1, dr
+        call vkahan( a(chunk*i-chunk+1:chunk*i)*${cnjg(t1,'b(chunk*i-chunk+1:chunk*i)')}$ , pbatch(1:chunk) , cbatch(1:chunk) )
+    end do
+    call vkahan( a(size(a)-rr+1:size(a))*${cnjg(t1,'b(size(a)-rr+1:size(a))')}$ , pbatch(1:rr) , cbatch(1:rr) )      
+
+    p = zero_${s1}$
+    do i = 1,chunk
+        call vkahan( pbatch(i) , p , cbatch(i) )
+    end do      
+end function
+#:endfor
+
+end submodule stdlib_intrinsics_dot_product
diff --git a/src/stdlib_intrinsics_sum.fypp b/src/stdlib_intrinsics_sum.fypp
new file mode 100644
index 000000000..e189ca897
--- /dev/null
+++ b/src/stdlib_intrinsics_sum.fypp
@@ -0,0 +1,118 @@
+#:include "common.fypp"
+#:set R_KINDS_TYPES = list(zip(REAL_KINDS, REAL_TYPES, REAL_SUFFIX))
+#:set C_KINDS_TYPES = list(zip(CMPLX_KINDS, CMPLX_TYPES, CMPLX_SUFFIX))
+#:set RC_KINDS_TYPES = R_KINDS_TYPES + C_KINDS_TYPES
+
+#:def cnjg(type,expression)
+#:if 'complex' in type
+conjg(${expression}$)
+#:else
+${expression}$
+#:endif
+#:enddef
+
+! This module is based on https://github.com/jalvesz/fast_math
+submodule(stdlib_intrinsics) stdlib_intrinsics_sum
+    !!Replacement for certain Fortran intrinsic functions offering either faster and/or more accurate implementations.
+    !! ([Specification](../page/specs/stdlib_intrinsics.html))
+    use stdlib_kinds
+    use stdlib_constants
+    implicit none
+
+    integer, parameter :: chunk = 64
+    
+contains
+
+#:for rk, rt, rs in RC_KINDS_TYPES
+pure module function fsum_1d_${rs}$(a) result(s)
+    ${rt}$, intent(in) :: a(:)
+    ${rt}$ :: s
+    ${rt}$ :: abatch(chunk)
+    integer :: i, dr, rr
+    ! -----------------------------
+    dr = size(a)/chunk
+    rr = size(a) - dr*chunk
+
+    abatch = zero_${rs}$
+    do i = 1, dr
+      abatch(1:chunk) = abatch(1:chunk) + a(chunk*i-chunk+1:chunk*i)
+    end do
+    abatch(1:rr) = abatch(1:rr) + a(size(a)-rr+1:size(a))
+
+    s = zero_${rs}$
+    do i = 1, chunk/2
+      s = s + abatch(i)+abatch(chunk/2+i)
+    end do
+end function
+
+pure module function fsum_1d_${rs}$_mask(a,mask) result(s)
+    ${rt}$, intent(in) :: a(:)
+    logical, intent(in) :: mask(:)
+    ${rt}$ :: s
+    ${rt}$ :: abatch(chunk)
+    integer :: i, dr, rr
+    ! -----------------------------
+    dr = size(a)/chunk
+    rr = size(a) - dr*chunk
+
+    abatch = zero_${rs}$
+    do i = 1, dr
+      abatch(1:chunk) = abatch(1:chunk) + merge( zero_${rs}$ , a(chunk*i-chunk+1:chunk*i) , mask(chunk*i-chunk+1:chunk*i) )
+    end do
+    abatch(1:rr) = abatch(1:rr) + merge( zero_${rs}$ , a(size(a)-rr+1:size(a)) , mask(size(a)-rr+1:size(a)) )
+    
+    s = zero_${rs}$
+    do i = 1, chunk/2
+        s = s + abatch(i)+abatch(chunk/2+i)
+    end do
+end function
+#:endfor
+
+#:for rk, rt, rs in RC_KINDS_TYPES
+pure module function fsum_kahan_1d_${rs}$(a) result(s)
+    ${rt}$, intent(in) :: a(:)
+    ${rt}$ :: s
+    ${rt}$ :: sbatch(chunk)
+    ${rt}$ :: cbatch(chunk)
+    integer :: i, dr, rr
+    ! -----------------------------
+    dr = size(a)/(chunk)
+    rr = size(a) - dr*chunk     
+    sbatch = zero_${rs}$
+    cbatch = zero_${rs}$
+    do i = 1, dr
+        call vkahan( a(chunk*i-chunk+1:chunk*i) , sbatch(1:chunk) , cbatch(1:chunk) )
+    end do
+    call vkahan( a(size(a)-rr+1:size(a)) , sbatch(1:rr) , cbatch(1:rr) )      
+
+    s = zero_${rs}$
+    do i = 1,chunk
+        call vkahan( sbatch(i) , s , cbatch(i) )
+    end do
+end function
+
+pure module function fsum_kahan_1d_${rs}$_mask(a,mask) result(s)
+    ${rt}$, intent(in) :: a(:)
+    logical, intent(in) :: mask(:)
+    ${rt}$ :: s
+    ${rt}$ :: sbatch(chunk)
+    ${rt}$ :: cbatch(chunk)
+    integer :: i, dr, rr
+    ! -----------------------------
+    dr = size(a)/(chunk)
+    rr = size(a) - dr*chunk     
+    sbatch = zero_${rs}$
+    cbatch = zero_${rs}$
+    do i = 1, dr
+        call vkahan( a(chunk*i-chunk+1:chunk*i) , sbatch(1:chunk) , cbatch(1:chunk) , mask(chunk*i-chunk+1:chunk*i) )
+    end do
+    call vkahan( a(size(a)-rr+1:size(a)) , sbatch(1:rr) , cbatch(1:rr) , mask(size(a)-rr+1:size(a)) )
+
+    s = zero_${rs}$
+    do i = 1,chunk
+        call vkahan( sbatch(i) , s , cbatch(i) )
+    end do
+end function
+#:endfor
+
+end submodule stdlib_intrinsics_sum

From d05903f0a7b6a2d010175c3bafe462f926291d12 Mon Sep 17 00:00:00 2001
From: jalvesz <jose.alves@transvalor.com>
Date: Thu, 2 Jan 2025 22:58:15 +0100
Subject: [PATCH 08/18] specs and examples

---
 doc/specs/stdlib_intrinsics.md             | 117 ++++++++++++++++++++-
 example/CMakeLists.txt                     |   1 +
 example/intrinsics/CMakeLists.txt          |   2 +
 example/intrinsics/example_dot_product.f90 |  18 ++++
 example/intrinsics/example_sum.f90         |  17 +++
 5 files changed, 153 insertions(+), 2 deletions(-)
 create mode 100644 example/intrinsics/CMakeLists.txt
 create mode 100644 example/intrinsics/example_dot_product.f90
 create mode 100644 example/intrinsics/example_sum.f90

diff --git a/doc/specs/stdlib_intrinsics.md b/doc/specs/stdlib_intrinsics.md
index 956d47ffc..099d6ccb4 100644
--- a/doc/specs/stdlib_intrinsics.md
+++ b/doc/specs/stdlib_intrinsics.md
@@ -15,7 +15,7 @@ The `stdlib_intrinsics` module provides replacements for some of the well known
 
 #### Description
 
-The `fsum` function can replace the intrinsic `sum` for 1D `real` or `complex` arrays. It follows a chunked implementation which maximaxes vectorization potential as well as reducing the round-off error.
+The `fsum` function can replace the intrinsic `sum` for 1D `real` or `complex` arrays. It follows a chunked implementation which maximizes vectorization potential as well as reducing the round-off error. This procedure is recommended when summing large arrays, for repetitive summation of smaller arrays consider the classical `sum`.
 
 #### Syntax
 
@@ -32,7 +32,7 @@ Pure function.
 #### Argument(s)
 
 `x`: 1D array of either `real` or `complex` type. This argument is `intent(in)`.
-`mask`: 1D array of `logical` values. This argument is `intent(in)`.
+`mask` (optional): 1D array of `logical` values. This argument is `intent(in)`.
 
 #### Output value or Result value
 
@@ -43,3 +43,116 @@ The output is a scalar of `type` and `kind` same as to that of `x`.
 ```fortran
 {!example/math/example_intrinsics_sum.f90!}
 ```
+
+<!-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -->
+### `fsum_kahan` function
+
+#### Description
+
+The `fsum_kahan` function can replace the intrinsic `sum` for 1D `real` or `complex` arrays. It follows a chunked implementation which maximizes vectorization potential, complemented by an `elemental` kernel based on the [kahan summation](https://en.wikipedia.org/wiki/Kahan_summation_algorithm) strategy to reduce the round-off error:
+
+```fortran
+elemental subroutine vkahan_<kind>(a,s,c)
+    type(<kind>), intent(in) :: a
+    type(<kind>), intent(inout) :: s
+    type(<kind>), intent(inout) :: c
+    type(<kind>) :: t, y
+    y = a - c
+    t = s + y
+    c = (t - s) - y
+    s = t
+end subroutine
+```
+
+#### Syntax
+
+`res = ` [[stdlib_intrinsics(module):fsum_kahan(interface)]] ` (x [,mask] )`
+
+#### Status
+
+Experimental
+
+#### Class
+
+Pure function.
+
+#### Argument(s)
+
+`x`: 1D array of either `real` or `complex` type. This argument is `intent(in)`.
+`mask` (optional): 1D array of `logical` values. This argument is `intent(in)`.
+
+#### Output value or Result value
+
+The output is a scalar of `type` and `kind` same as to that of `x`.
+
+#### Example
+
+```fortran
+{!example/math/example_intrinsics_sum.f90!}
+```
+
+<!-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -->
+### `fprod` function
+
+#### Description
+
+The `fprod` function can replace the intrinsic `dot_product` for 1D `real` or `complex` arrays. It follows a chunked implementation which maximizes vectorization potential as well as reducing the round-off error. This procedure is recommended when crunching large arrays, for repetitive products of smaller arrays consider the classical `dot_product`.
+
+#### Syntax
+
+`res = ` [[stdlib_intrinsics(module):fprod(interface)]] ` (x, y)`
+
+#### Status
+
+Experimental
+
+#### Class
+
+Pure function.
+
+#### Argument(s)
+
+`x`: 1D array of either `real` or `complex` type. This argument is `intent(in)`.
+`y`: 1D array of the same type and kind as `x`. This argument is `intent(in)`.
+
+#### Output value or Result value
+
+The output is a scalar of `type` and `kind` same as to that of `x` and `y`.
+
+#### Example
+
+```fortran
+{!example/math/example_intrinsics_dot_duct.f90!}
+```
+
+<!-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -->
+### `fprod_kahan` function
+
+#### Description
+
+The `fprod_kahan` function can replace the intrinsic `dot_product` for 1D `real` or `complex` arrays. It follows a chunked implementation which maximizes vectorization potential , complemented by the same `elemental` kernel based on the [kahan summation](https://en.wikipedia.org/wiki/Kahan_summation_algorithm) used for `fsum` to reduce the round-off error.
+
+#### Syntax
+
+`res = ` [[stdlib_intrinsics(module):fprod_kahan(interface)]] ` (x, y)`
+
+#### Status
+
+Experimental
+
+#### Class
+
+Pure function.
+
+#### Argument(s)
+
+`x`: 1D array of either `real` or `complex` type. This argument is `intent(in)`.
+`y`: 1D array of the same type and kind as `x`. This argument is `intent(in)`.
+
+#### Output value or Result value
+
+The output is a scalar of `type` and `kind` same as to that of `x` and `y`.
+
+```fortran
+{!example/math/example_intrinsics_dot_duct.f90!}
+```
\ No newline at end of file
diff --git a/example/CMakeLists.txt b/example/CMakeLists.txt
index cbef7f075..968a048b0 100644
--- a/example/CMakeLists.txt
+++ b/example/CMakeLists.txt
@@ -13,6 +13,7 @@ add_subdirectory(constants)
 add_subdirectory(error)
 add_subdirectory(hashmaps)
 add_subdirectory(hash_procedures)
+add_subdirectory(intrinsics)
 add_subdirectory(io)
 add_subdirectory(linalg)
 add_subdirectory(logger)
diff --git a/example/intrinsics/CMakeLists.txt b/example/intrinsics/CMakeLists.txt
new file mode 100644
index 000000000..1645ba8a1
--- /dev/null
+++ b/example/intrinsics/CMakeLists.txt
@@ -0,0 +1,2 @@
+ADD_EXAMPLE(sum)
+ADD_EXAMPLE(dot_product)
\ No newline at end of file
diff --git a/example/intrinsics/example_dot_product.f90 b/example/intrinsics/example_dot_product.f90
new file mode 100644
index 000000000..c0d111d83
--- /dev/null
+++ b/example/intrinsics/example_dot_product.f90
@@ -0,0 +1,18 @@
+program example_dot_product
+    use stdlib_kinds, only: sp
+    use stdlib_intrinsics, only: fprod, fprod_kahan
+    implicit none
+
+    real(sp), allocatable :: x(:), y(:)
+    real(sp) :: total_prod(3)
+
+    allocate( x(1000), y(1000) )
+    call random_number(x)
+    call random_number(y)
+
+    total_prod(1) = dot_product(x,y) !> compiler intrinsic
+    total_prod(2) = fprod(x,y)       !> chunked summation over inner product
+    total_prod(3) = fprod_kahan(x,y) !> chunked kahan summation over inner product
+    print *, total_prod(1:3)
+    
+end program example_dot_product
\ No newline at end of file
diff --git a/example/intrinsics/example_sum.f90 b/example/intrinsics/example_sum.f90
new file mode 100644
index 000000000..70c59d954
--- /dev/null
+++ b/example/intrinsics/example_sum.f90
@@ -0,0 +1,17 @@
+program example_sum
+    use stdlib_kinds, only: sp
+    use stdlib_intrinsics, only: fsum, fsum_kahan
+    implicit none
+
+    real(sp), allocatable :: x(:)
+    real(sp) :: total_sum(3)
+
+    allocate( x(1000) )
+    call random_number(x)
+
+    total_sum(1) = sum(x)       !> compiler intrinsic
+    total_sum(2) = fsum(x)      !> chunked summation
+    total_sum(3) = fsum_kahan(x)!> chunked kahan summation
+    print *, total_sum(1:3)
+    
+end program example_sum
\ No newline at end of file

From 7c6e8a436f61c39f5b19b17a8ce8abc31b294fef Mon Sep 17 00:00:00 2001
From: jalvesz <jose.alves@transvalor.com>
Date: Fri, 3 Jan 2025 21:21:16 +0100
Subject: [PATCH 09/18] fix specs

---
 doc/specs/stdlib_intrinsics.md | 20 ++++++--------------
 1 file changed, 6 insertions(+), 14 deletions(-)

diff --git a/doc/specs/stdlib_intrinsics.md b/doc/specs/stdlib_intrinsics.md
index 099d6ccb4..2a7fa3ea4 100644
--- a/doc/specs/stdlib_intrinsics.md
+++ b/doc/specs/stdlib_intrinsics.md
@@ -32,18 +32,13 @@ Pure function.
 #### Argument(s)
 
 `x`: 1D array of either `real` or `complex` type. This argument is `intent(in)`.
+
 `mask` (optional): 1D array of `logical` values. This argument is `intent(in)`.
 
 #### Output value or Result value
 
 The output is a scalar of `type` and `kind` same as to that of `x`.
 
-#### Example
-
-```fortran
-{!example/math/example_intrinsics_sum.f90!}
-```
-
 <!-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -->
 ### `fsum_kahan` function
 
@@ -79,6 +74,7 @@ Pure function.
 #### Argument(s)
 
 `x`: 1D array of either `real` or `complex` type. This argument is `intent(in)`.
+
 `mask` (optional): 1D array of `logical` values. This argument is `intent(in)`.
 
 #### Output value or Result value
@@ -88,7 +84,7 @@ The output is a scalar of `type` and `kind` same as to that of `x`.
 #### Example
 
 ```fortran
-{!example/math/example_intrinsics_sum.f90!}
+{!example/intrinsics/example_sum.f90!}
 ```
 
 <!-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -->
@@ -113,18 +109,13 @@ Pure function.
 #### Argument(s)
 
 `x`: 1D array of either `real` or `complex` type. This argument is `intent(in)`.
+
 `y`: 1D array of the same type and kind as `x`. This argument is `intent(in)`.
 
 #### Output value or Result value
 
 The output is a scalar of `type` and `kind` same as to that of `x` and `y`.
 
-#### Example
-
-```fortran
-{!example/math/example_intrinsics_dot_duct.f90!}
-```
-
 <!-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -->
 ### `fprod_kahan` function
 
@@ -147,6 +138,7 @@ Pure function.
 #### Argument(s)
 
 `x`: 1D array of either `real` or `complex` type. This argument is `intent(in)`.
+
 `y`: 1D array of the same type and kind as `x`. This argument is `intent(in)`.
 
 #### Output value or Result value
@@ -154,5 +146,5 @@ Pure function.
 The output is a scalar of `type` and `kind` same as to that of `x` and `y`.
 
 ```fortran
-{!example/math/example_intrinsics_dot_duct.f90!}
+{!example/intrinsics/example_dot_product.f90!}
 ```
\ No newline at end of file

From 7cea1fd43e27da4a23799166043182bf7463f076 Mon Sep 17 00:00:00 2001
From: jalvesz <jose.alves@transvalor.com>
Date: Fri, 3 Jan 2025 22:11:54 +0100
Subject: [PATCH 10/18] fix test: complex initialization

---
 test/intrinsics/test_intrinsics.fypp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/intrinsics/test_intrinsics.fypp b/test/intrinsics/test_intrinsics.fypp
index f83f60759..8969d3afb 100644
--- a/test/intrinsics/test_intrinsics.fypp
+++ b/test/intrinsics/test_intrinsics.fypp
@@ -90,7 +90,7 @@ subroutine test_sum(error)
 
         allocate(x(n))
         do i = 1, n
-            x(i) = complex(&
+            x(i) = cmplx(&
                 8*atan(1._${k1}$)*(real(i,kind=${k1}$)-0.5_${k1}$)/real(2*n,kind=${k1}$)**2,&
                 8*atan(1._${k1}$)*(real(i+n,kind=${k1}$)-0.5_${k1}$)/real(2*n,kind=${k1}$)**2)
         end do
@@ -182,7 +182,7 @@ subroutine test_dot_product(error)
 
         allocate(x(n))
         do i = 1, n
-            x(i) = complex(&
+            x(i) = cmplx(&
                 sqrt(8*atan(1._${k1}$)*(real(i,kind=${k1}$)-0.5_${k1}$)/real(2*n,kind=${k1}$)**2),&
                 sqrt(8*atan(1._${k1}$)*(real(i+n,kind=${k1}$)-0.5_${k1}$)/real(2*n,kind=${k1}$)**2))
         end do

From eaffa4a8aa1c6c5374cb6b5bda6ee3b0d048f6d8 Mon Sep 17 00:00:00 2001
From: jalvesz <jose.alves@transvalor.com>
Date: Sat, 4 Jan 2025 19:26:12 +0100
Subject: [PATCH 11/18] fix test: complex assignment caused accuracy loss

---
 test/intrinsics/test_intrinsics.fypp | 99 +++++++++++-----------------
 1 file changed, 37 insertions(+), 62 deletions(-)

diff --git a/test/intrinsics/test_intrinsics.fypp b/test/intrinsics/test_intrinsics.fypp
index 8969d3afb..3f1a3a98f 100644
--- a/test/intrinsics/test_intrinsics.fypp
+++ b/test/intrinsics/test_intrinsics.fypp
@@ -28,7 +28,7 @@ subroutine test_sum(error)
     type(error_type), allocatable, intent(out) :: error
 
     !> Internal parameters and variables
-    integer, parameter :: n = 1e3, ncalc = 3, niter = 100
+    integer, parameter :: n = 1e3, ncalc = 3
     real(sp) :: u
     integer :: iter, i, j
     !====================================================================================
@@ -36,7 +36,7 @@ subroutine test_sum(error)
     block
         ${t1}$, allocatable :: x(:)
         ${t1}$, parameter :: total_sum = 4*atan(1._${k1}$), tolerance = epsilon(1._${k1}$)*100
-        ${t1}$ :: xsum(ncalc), meanval(ncalc), err(ncalc)
+        ${t1}$ :: xsum(ncalc), err(ncalc)
         logical, allocatable :: mask(:), nmask(:)
 
         allocate(x(n))
@@ -54,26 +54,18 @@ subroutine test_sum(error)
             call swap( nmask(i), nmask(j) )
         end do
         
-        err(:) = 0._${k1}$
-        do iter = 1, niter
-            xsum(1) = sum(x)        ! compiler intrinsic
-            xsum(2) = fsum_kahan(x) ! chunked Kahan summation
-            xsum(3) = fsum(x)       ! chunked summation
-            err(1:ncalc) = err(1:ncalc) + abs(1._${k1}$-xsum(1:ncalc)/total_sum)
-        end do
-        err(1:ncalc) = err(1:ncalc) / niter 
+        xsum(1) = sum(x)        ! compiler intrinsic
+        xsum(2) = fsum_kahan(x) ! chunked Kahan summation
+        xsum(3) = fsum(x)       ! chunked summation
+        err(1:ncalc) = abs(1._${k1}$-xsum(1:ncalc)/total_sum)
         
         call check(error, all(err(:)<tolerance) , "real sum is not accurate" )
         if (allocated(error)) return
 
-        err(:) = 0._${k1}$
-        do iter = 1, niter
-            xsum(1) = sum(x,mask)+sum(x,nmask) ! compiler intrinsic
-            xsum(2) = fsum_kahan(x,mask)+fsum_kahan(x,nmask) ! chunked Kahan summation
-            xsum(3) = fsum(x,mask)+fsum(x,nmask) ! chunked summation
-            err(1:ncalc) = err(1:ncalc) + abs(1._${k1}$-xsum(1:ncalc)/total_sum)
-        end do
-        err(1:ncalc) = err(1:ncalc) / niter 
+        xsum(1) = sum(x,mask)+sum(x,nmask) ! compiler intrinsic
+        xsum(2) = fsum_kahan(x,mask)+fsum_kahan(x,nmask) ! chunked Kahan summation
+        xsum(3) = fsum(x,mask)+fsum(x,nmask) ! chunked summation
+        err(1:ncalc) = abs(1._${k1}$-xsum(1:ncalc)/total_sum)
         
         call check(error, all(err(:)<tolerance) , "masked real sum is not accurate" )
         if (allocated(error)) return
@@ -85,15 +77,14 @@ subroutine test_sum(error)
         ${t1}$, allocatable :: x(:)
         real(${k1}$), parameter :: total_sum = 4*atan(1._${k1}$), tolerance = epsilon(1._${k1}$)*100
         real(${k1}$) :: err(ncalc)
-        ${t1}$ :: xsum(ncalc), meanval(ncalc)
+        ${t1}$ :: xsum(ncalc)
         logical, allocatable :: mask(:), nmask(:)
 
         allocate(x(n))
         do i = 1, n
-            x(i) = cmplx(&
-                8*atan(1._${k1}$)*(real(i,kind=${k1}$)-0.5_${k1}$)/real(2*n,kind=${k1}$)**2,&
-                8*atan(1._${k1}$)*(real(i+n,kind=${k1}$)-0.5_${k1}$)/real(2*n,kind=${k1}$)**2)
+            x(i) = (8*atan(1._${k1}$)*(real(i,kind=${k1}$)-0.5_${k1}$)/n**2)*cmplx(1._${k1}$,1._${k1}$)
         end do
+        
         allocate(mask(n),source=.false.); mask(1:n:2) = .true.
         allocate(nmask(n)); nmask = .not.mask
         ! scramble array
@@ -105,26 +96,20 @@ subroutine test_sum(error)
             call swap( nmask(i), nmask(j) )
         end do
         
-        err(:) = 0._${k1}$
-        do iter = 1, niter
-            xsum(1) = sum(x)        ! compiler intrinsic
-            xsum(2) = fsum_kahan(x) ! chunked Kahan summation
-            xsum(3) = fsum(x)       ! chunked summation
-            err(1:ncalc) = err(1:ncalc) + abs(1._${k1}$-(xsum(1:ncalc)%re+xsum(1:ncalc)%im)/total_sum)
-        end do
-        err(1:ncalc) = err(1:ncalc) / niter 
+        xsum(1) = sum(x)        ! compiler intrinsic
+        xsum(2) = fsum_kahan(x) ! chunked Kahan summation
+        xsum(3) = fsum(x)       ! chunked summation
+        err(1:ncalc) = abs(1._${k1}$-(xsum(1:ncalc)%re)/total_sum)
+        
         
         call check(error, all(err(:)<tolerance) , "complex sum is not accurate" )
         if (allocated(error)) return
 
-        err(:) = 0._${k1}$
-        do iter = 1, niter
-            xsum(1) = sum(x,mask)+sum(x,nmask) ! compiler intrinsic
-            xsum(2) = fsum_kahan(x,mask)+fsum_kahan(x,nmask) ! chunked Kahan summation
-            xsum(3) = fsum(x,mask)+fsum(x,nmask) ! chunked summation
-            err(1:ncalc) = err(1:ncalc) + abs(1._${k1}$-(xsum(1:ncalc)%re+xsum(1:ncalc)%im)/total_sum)
-        end do
-        err(1:ncalc) = err(1:ncalc) / niter 
+        xsum(1) = sum(x,mask)+sum(x,nmask) ! compiler intrinsic
+        xsum(2) = fsum_kahan(x,mask)+fsum_kahan(x,nmask) ! chunked Kahan summation
+        xsum(3) = fsum(x,mask)+fsum(x,nmask) ! chunked summation
+        err(1:ncalc) = abs(1._${k1}$-(xsum(1:ncalc)%re)/total_sum)
+        
         
         call check(error, all(err(:)<tolerance) , "complex masked sum is not accurate" )
         if (allocated(error)) return
@@ -138,7 +123,7 @@ subroutine test_dot_product(error)
     type(error_type), allocatable, intent(out) :: error
 
     !> Internal parameters and variables
-    integer, parameter :: n = 1e3, ncalc = 3, niter = 100
+    integer, parameter :: n = 1e3, ncalc = 3
     real(sp) :: u
     integer :: iter, i, j
     !====================================================================================
@@ -146,11 +131,11 @@ subroutine test_dot_product(error)
     block
         ${t1}$, allocatable :: x(:)
         ${t1}$, parameter :: total_sum = 4*atan(1._${k1}$), tolerance = epsilon(1._${k1}$)*100
-        ${t1}$ :: xsum(ncalc), meanval(ncalc), err(ncalc)
+        ${t1}$ :: xsum(ncalc), err(ncalc)
 
         allocate(x(n))
         do i = 1, n 
-            x(i) = sqrt( 8*atan(1._${k1}$)*(real(i,kind=${k1}$)-0.5_${k1}$)/real(n,kind=${k1}$)**2 )
+            x(i) = 2*sqrt( 2*atan(1._${k1}$)*(real(i,kind=${k1}$)-0.5_${k1}$) )/n
         end do
         ! scramble array
         do i = 1, n
@@ -159,14 +144,10 @@ subroutine test_dot_product(error)
             call swap( x(i), x(j) )
         end do
         
-        err(:) = 0._${k1}$
-        do iter = 1, niter
-            xsum(1) = dot_product(x,x) ! compiler intrinsic
-            xsum(2) = fprod_kahan(x,x) ! chunked Kahan summation
-            xsum(3) = fprod(x,x)       ! chunked summation
-            err(1:ncalc) = err(1:ncalc) + abs(1._${k1}$-xsum(1:ncalc)/total_sum)
-        end do
-        err(1:ncalc) = err(1:ncalc) / niter 
+        xsum(1) = dot_product(x,x) ! compiler intrinsic
+        xsum(2) = fprod_kahan(x,x) ! chunked Kahan summation
+        xsum(3) = fprod(x,x)       ! chunked summation
+        err(1:ncalc) = abs(1._${k1}$-xsum(1:ncalc)/total_sum)
         
         call check(error, all(err(:)<tolerance) , "real dot_product is not accurate" )
         if (allocated(error)) return
@@ -178,13 +159,11 @@ subroutine test_dot_product(error)
         ${t1}$, allocatable :: x(:)
         real(${k1}$), parameter :: total_sum = 4*atan(1._${k1}$), tolerance = epsilon(1._${k1}$)*100
         real(${k1}$) :: err(ncalc)
-        ${t1}$ :: xsum(ncalc), meanval(ncalc)
+        ${t1}$ :: xsum(ncalc)
 
         allocate(x(n))
         do i = 1, n
-            x(i) = cmplx(&
-                sqrt(8*atan(1._${k1}$)*(real(i,kind=${k1}$)-0.5_${k1}$)/real(2*n,kind=${k1}$)**2),&
-                sqrt(8*atan(1._${k1}$)*(real(i+n,kind=${k1}$)-0.5_${k1}$)/real(2*n,kind=${k1}$)**2))
+            x(i) = ( 2*sqrt( 2*atan(1._${k1}$)*(real(i,kind=${k1}$)-0.5_${k1}$) ) / n )*cmplx(1._${k1}$,1._${k1}$)
         end do
         ! scramble array
         do i = 1, n
@@ -193,15 +172,11 @@ subroutine test_dot_product(error)
             call swap( x(i), x(j) )
         end do
         
-        err(:) = 0._${k1}$
-        do iter = 1, niter
-            xsum(1) = dot_product(x,x) ! compiler intrinsic
-            xsum(2) = fprod_kahan(x,x) ! chunked Kahan dot_product
-            xsum(3) = fprod(x,x)       ! chunked dot_product
-            err(1:ncalc) = err(1:ncalc) + abs(1._${k1}$-(xsum(1:ncalc)%re+xsum(1:ncalc)%im)/total_sum)
-        end do
-        err(1:ncalc) = err(1:ncalc) / niter 
-        
+        xsum(1) = dot_product(x,x) ! compiler intrinsic
+        xsum(2) = fprod_kahan(x,x) ! chunked Kahan dot_product
+        xsum(3) = fprod(x,x)       ! chunked dot_product
+        err(1:ncalc) = abs(1._${k1}$-xsum(1:ncalc)%re/(2*total_sum))
+         
         call check(error, all(err(:)<tolerance) , "complex dot_product is not accurate" )
         if (allocated(error)) return
     end block

From a3d24e4d17b205e77f5198288f1c4aa23a63415a Mon Sep 17 00:00:00 2001
From: jalvesz <jose.alves@transvalor.com>
Date: Sun, 5 Jan 2025 21:38:35 +0100
Subject: [PATCH 12/18] extend fsum support for ndarrays

---
 src/stdlib_intrinsics.fypp           | 14 ++++++
 src/stdlib_intrinsics_sum.fypp       | 73 ++++++++++++++++++++++++++++
 test/intrinsics/test_intrinsics.fypp | 20 +++++++-
 3 files changed, 106 insertions(+), 1 deletion(-)

diff --git a/src/stdlib_intrinsics.fypp b/src/stdlib_intrinsics.fypp
index 0a4d3244a..1b9604cee 100644
--- a/src/stdlib_intrinsics.fypp
+++ b/src/stdlib_intrinsics.fypp
@@ -2,6 +2,7 @@
 #:set R_KINDS_TYPES = list(zip(REAL_KINDS, REAL_TYPES, REAL_SUFFIX))
 #:set C_KINDS_TYPES = list(zip(CMPLX_KINDS, CMPLX_TYPES, CMPLX_SUFFIX))
 #:set RC_KINDS_TYPES = R_KINDS_TYPES + C_KINDS_TYPES
+#:set RANKS = range(1, MAXRANK + 1)
 
 ! This module is based on https://github.com/jalvesz/fast_math
 module stdlib_intrinsics
@@ -22,6 +23,19 @@ module stdlib_intrinsics
             logical, intent(in) :: mask(:)
             ${rt}$ :: s
         end function
+        #:for rank in RANKS
+        pure module function fsum_${rank}$d_${rs}$( x, mask ) result( s )
+            ${rt}$, intent(in) :: x${ranksuffix(rank)}$
+            logical, intent(in), optional :: mask${ranksuffix(rank)}$
+            ${rt}$ :: s
+        end function
+        pure module function fsum_${rank}$d_dim_${rs}$( x , dim, mask ) result( s )
+            ${rt}$, intent(in) :: x${ranksuffix(rank)}$
+            integer, intent(in):: dim
+            logical, intent(in), optional :: mask${ranksuffix(rank)}$
+            ${rt}$ :: s${reduced_shape('x', rank, 'dim')}$
+        end function
+        #:endfor
         #:endfor
     end interface
     public :: fsum
diff --git a/src/stdlib_intrinsics_sum.fypp b/src/stdlib_intrinsics_sum.fypp
index e189ca897..f1f831be4 100644
--- a/src/stdlib_intrinsics_sum.fypp
+++ b/src/stdlib_intrinsics_sum.fypp
@@ -2,6 +2,7 @@
 #:set R_KINDS_TYPES = list(zip(REAL_KINDS, REAL_TYPES, REAL_SUFFIX))
 #:set C_KINDS_TYPES = list(zip(CMPLX_KINDS, CMPLX_TYPES, CMPLX_SUFFIX))
 #:set RC_KINDS_TYPES = R_KINDS_TYPES + C_KINDS_TYPES
+#:set RANKS = range(1, MAXRANK + 1)
 
 #:def cnjg(type,expression)
 #:if 'complex' in type
@@ -66,6 +67,78 @@ pure module function fsum_1d_${rs}$_mask(a,mask) result(s)
         s = s + abatch(i)+abatch(chunk/2+i)
     end do
 end function
+
+#:for rank in RANKS
+pure module function fsum_${rank}$d_${rs}$( x , mask ) result( s )
+    ${rt}$, intent(in) :: x${ranksuffix(rank)}$
+    logical, intent(in), optional :: mask${ranksuffix(rank)}$
+    ${rt}$ :: s
+    if(.not.present(mask)) then
+        s = sum_recast(x,size(x))
+    else
+        s = sum_recast_mask(x,mask,size(x))
+    end if
+contains
+    pure ${rt}$ function sum_recast(b,n)
+      integer, intent(in) :: n
+      ${rt}$, intent(in) :: b(n)
+      sum_recast = fsum(b)
+    end function
+    pure ${rt}$ function sum_recast_mask(b,m,n)
+      integer, intent(in) :: n
+      ${rt}$, intent(in) :: b(n)
+      logical, intent(in) :: m(n)
+      sum_recast_mask = fsum(b,m)
+    end function
+end function
+
+pure module function fsum_${rank}$d_dim_${rs}$( x , dim, mask ) result( s )
+    ${rt}$, intent(in) :: x${ranksuffix(rank)}$
+    integer, intent(in):: dim
+    logical, intent(in), optional :: mask${ranksuffix(rank)}$
+    ${rt}$ :: s${reduced_shape('x', rank, 'dim')}$
+    integer :: j 
+
+    if(.not.present(mask)) then
+        if(dim<${rank}$)then
+            do j = 1, size(x,dim=${rank}$)
+                #:if rank == 2
+                s${select_subarray(rank-1, [(rank-1, 'j')])}$ = fsum( x${select_subarray(rank, [(rank, 'j')])}$ )
+                #:else
+                s${select_subarray(rank-1, [(rank-1, 'j')])}$ = fsum( x${select_subarray(rank, [(rank, 'j')])}$, dim=dim )
+                #:endif
+            end do
+        else
+            do j = 1, size(x,dim=1)
+                #:if rank == 2
+                s${select_subarray(rank-1, [(1, 'j')])}$ = fsum( x${select_subarray(rank, [(1, 'j')])}$ )
+                #:else
+                s${select_subarray(rank-1, [(1, 'j')])}$ = fsum( x${select_subarray(rank, [(1, 'j')])}$, dim=${rank-1}$ )
+                #:endif
+            end do
+        end if
+    else 
+        if(dim<${rank}$)then
+            do j = 1, size(x,dim=${rank}$)
+                #:if rank == 2
+                s${select_subarray(rank-1, [(rank-1, 'j')])}$ = fsum( x${select_subarray(rank, [(rank, 'j')])}$, mask=mask${select_subarray(rank, [(rank, 'j')])}$ )
+                #:else
+                s${select_subarray(rank-1, [(rank-1, 'j')])}$ = fsum( x${select_subarray(rank, [(rank, 'j')])}$, dim=dim, mask=mask${select_subarray(rank, [(rank, 'j')])}$ )
+                #:endif
+            end do
+        else
+            do j = 1, size(x,dim=1)
+                #:if rank == 2
+                s${select_subarray(rank-1, [(1, 'j')])}$ = fsum( x${select_subarray(rank, [(1, 'j')])}$, mask=mask${select_subarray(rank, [(1, 'j')])}$ )
+                #:else
+                s${select_subarray(rank-1, [(1, 'j')])}$ = fsum( x${select_subarray(rank, [(1, 'j')])}$, dim=${rank-1}$, mask=mask${select_subarray(rank, [(1, 'j')])}$ )
+                #:endif
+            end do
+        end if
+    end if
+
+end function
+#:endfor
 #:endfor
 
 #:for rk, rt, rs in RC_KINDS_TYPES
diff --git a/test/intrinsics/test_intrinsics.fypp b/test/intrinsics/test_intrinsics.fypp
index 3f1a3a98f..ce8ec5c0b 100644
--- a/test/intrinsics/test_intrinsics.fypp
+++ b/test/intrinsics/test_intrinsics.fypp
@@ -110,12 +110,30 @@ subroutine test_sum(error)
         xsum(3) = fsum(x,mask)+fsum(x,nmask) ! chunked summation
         err(1:ncalc) = abs(1._${k1}$-(xsum(1:ncalc)%re)/total_sum)
         
-        
         call check(error, all(err(:)<tolerance) , "complex masked sum is not accurate" )
         if (allocated(error)) return
     end block
     #:endfor
 
+    ndarray : block
+        use stdlib_strings, only: to_string
+        real(sp), allocatable :: x(:,:,:)
+        real(sp), parameter :: tolerance = epsilon(1._sp)*100
+        integer :: i 
+
+        allocate(x(100,100,10))
+        call random_number(x)
+        !> sum all elements
+        call check(error, abs( sum(x) - fsum(x) )<tolerance*size(x) , "KO: full ndarray fsum" )
+        if (allocated(error)) return
+
+        !> sum over specific rank dim
+        do i = 1, rank(x)
+            call check(error, norm2( sum(x,dim=i) - fsum(x,dim=i) )<tolerance*size(x) , "KO: ndarray fsum over dim "//to_string(i) )
+            if (allocated(error)) return
+        end do
+    end block ndarray
+
 end subroutine
 
 subroutine test_dot_product(error)

From 5a1fdcb6fe4c2097dd1752e8cb77b0597460f5ff Mon Sep 17 00:00:00 2001
From: jalvesz <jose.alves@transvalor.com>
Date: Sun, 5 Jan 2025 21:39:44 +0100
Subject: [PATCH 13/18] remove unnecessary definition

---
 src/stdlib_intrinsics_sum.fypp | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/src/stdlib_intrinsics_sum.fypp b/src/stdlib_intrinsics_sum.fypp
index f1f831be4..7c6cbc522 100644
--- a/src/stdlib_intrinsics_sum.fypp
+++ b/src/stdlib_intrinsics_sum.fypp
@@ -4,14 +4,6 @@
 #:set RC_KINDS_TYPES = R_KINDS_TYPES + C_KINDS_TYPES
 #:set RANKS = range(1, MAXRANK + 1)
 
-#:def cnjg(type,expression)
-#:if 'complex' in type
-conjg(${expression}$)
-#:else
-${expression}$
-#:endif
-#:enddef
-
 ! This module is based on https://github.com/jalvesz/fast_math
 submodule(stdlib_intrinsics) stdlib_intrinsics_sum
     !!Replacement for certain Fortran intrinsic functions offering either faster and/or more accurate implementations.

From 47396acf367f01ba06e93664e0346f769436ef33 Mon Sep 17 00:00:00 2001
From: jalvesz <jose.alves@transvalor.com>
Date: Sun, 5 Jan 2025 21:56:50 +0100
Subject: [PATCH 14/18] update specs, change name of kahan kernel

---
 doc/specs/stdlib_intrinsics.md         | 14 +++++++++-----
 src/stdlib_intrinsics.fypp             | 14 +++++++-------
 src/stdlib_intrinsics_dot_product.fypp |  6 +++---
 src/stdlib_intrinsics_sum.fypp         | 14 +++++++-------
 4 files changed, 26 insertions(+), 22 deletions(-)

diff --git a/doc/specs/stdlib_intrinsics.md b/doc/specs/stdlib_intrinsics.md
index 2a7fa3ea4..c9baf619c 100644
--- a/doc/specs/stdlib_intrinsics.md
+++ b/doc/specs/stdlib_intrinsics.md
@@ -15,12 +15,14 @@ The `stdlib_intrinsics` module provides replacements for some of the well known
 
 #### Description
 
-The `fsum` function can replace the intrinsic `sum` for 1D `real` or `complex` arrays. It follows a chunked implementation which maximizes vectorization potential as well as reducing the round-off error. This procedure is recommended when summing large arrays, for repetitive summation of smaller arrays consider the classical `sum`.
+The `fsum` function can replace the intrinsic `sum` for `real` or `complex` arrays. It follows a chunked implementation which maximizes vectorization potential as well as reducing the round-off error. This procedure is recommended when summing large arrays, for repetitive summation of smaller arrays consider the classical `sum`.
 
 #### Syntax
 
 `res = ` [[stdlib_intrinsics(module):fsum(interface)]] ` (x [,mask] )`
 
+`res = ` [[stdlib_intrinsics(module):fsum(interface)]] ` (x, dim [,mask] )`
+
 #### Status
 
 Experimental
@@ -31,13 +33,15 @@ Pure function.
 
 #### Argument(s)
 
-`x`: 1D array of either `real` or `complex` type. This argument is `intent(in)`.
+`x`: N-D array of either `real` or `complex` type. This argument is `intent(in)`.
 
-`mask` (optional): 1D array of `logical` values. This argument is `intent(in)`.
+`dim` (optional): scalar of type `integer` with a value in the range from 1 to n, where n equals the rank of `x`.
+
+`mask` (optional): N-D array of `logical` values, with the same shape as `x`. This argument is `intent(in)`.
 
 #### Output value or Result value
 
-The output is a scalar of `type` and `kind` same as to that of `x`.
+If `dim` is absent, the output is a scalar of the same `type` and `kind` as to that of `x`. Otherwise, an array of rank n-1, where n equals the rank of `x`, and a shape similar to that of `x` with dimension `dim` dropped is returned.
 
 <!-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -->
 ### `fsum_kahan` function
@@ -47,7 +51,7 @@ The output is a scalar of `type` and `kind` same as to that of `x`.
 The `fsum_kahan` function can replace the intrinsic `sum` for 1D `real` or `complex` arrays. It follows a chunked implementation which maximizes vectorization potential, complemented by an `elemental` kernel based on the [kahan summation](https://en.wikipedia.org/wiki/Kahan_summation_algorithm) strategy to reduce the round-off error:
 
 ```fortran
-elemental subroutine vkahan_<kind>(a,s,c)
+elemental subroutine kahan_kernel_<kind>(a,s,c)
     type(<kind>), intent(in) :: a
     type(<kind>), intent(inout) :: s
     type(<kind>), intent(inout) :: c
diff --git a/src/stdlib_intrinsics.fypp b/src/stdlib_intrinsics.fypp
index 1b9604cee..f8da7a876 100644
--- a/src/stdlib_intrinsics.fypp
+++ b/src/stdlib_intrinsics.fypp
@@ -2,7 +2,7 @@
 #:set R_KINDS_TYPES = list(zip(REAL_KINDS, REAL_TYPES, REAL_SUFFIX))
 #:set C_KINDS_TYPES = list(zip(CMPLX_KINDS, CMPLX_TYPES, CMPLX_SUFFIX))
 #:set RC_KINDS_TYPES = R_KINDS_TYPES + C_KINDS_TYPES
-#:set RANKS = range(1, MAXRANK + 1)
+#:set RANKS = range(2, MAXRANK + 1)
 
 ! This module is based on https://github.com/jalvesz/fast_math
 module stdlib_intrinsics
@@ -77,18 +77,18 @@ module stdlib_intrinsics
     end interface
     public :: fprod_kahan
 
-    interface vkahan 
+    interface kahan_kernel 
         #:for rk, rt, rs in RC_KINDS_TYPES
-        module procedure :: vkahan_${rs}$
-        module procedure :: vkahan_m_${rs}$
+        module procedure :: kahan_kernel_${rs}$
+        module procedure :: kahan_kernel_m_${rs}$
         #:endfor
     end interface
-    public :: vkahan
+    public :: kahan_kernel
     
 contains
 
 #:for rk, rt, rs in RC_KINDS_TYPES
-elemental subroutine vkahan_${rs}$(a,s,c)
+elemental subroutine kahan_kernel_${rs}$(a,s,c)
     ${rt}$, intent(in) :: a
     ${rt}$, intent(inout) :: s
     ${rt}$, intent(inout) :: c
@@ -98,7 +98,7 @@ elemental subroutine vkahan_${rs}$(a,s,c)
     c = (t - s) - y
     s = t
 end subroutine  
-elemental subroutine vkahan_m_${rs}$(a,s,c,m)
+elemental subroutine kahan_kernel_m_${rs}$(a,s,c,m)
     ${rt}$, intent(in) :: a
     ${rt}$, intent(inout) :: s
     ${rt}$, intent(inout) :: c
diff --git a/src/stdlib_intrinsics_dot_product.fypp b/src/stdlib_intrinsics_dot_product.fypp
index 045d512b2..73e4fffcc 100644
--- a/src/stdlib_intrinsics_dot_product.fypp
+++ b/src/stdlib_intrinsics_dot_product.fypp
@@ -61,13 +61,13 @@ pure module function fprod_kahan_${s1}$(a,b) result(p)
     pbatch = zero_${s1}$
     cbatch = zero_${s1}$
     do i = 1, dr
-        call vkahan( a(chunk*i-chunk+1:chunk*i)*${cnjg(t1,'b(chunk*i-chunk+1:chunk*i)')}$ , pbatch(1:chunk) , cbatch(1:chunk) )
+        call kahan_kernel( a(chunk*i-chunk+1:chunk*i)*${cnjg(t1,'b(chunk*i-chunk+1:chunk*i)')}$ , pbatch(1:chunk) , cbatch(1:chunk) )
     end do
-    call vkahan( a(size(a)-rr+1:size(a))*${cnjg(t1,'b(size(a)-rr+1:size(a))')}$ , pbatch(1:rr) , cbatch(1:rr) )      
+    call kahan_kernel( a(size(a)-rr+1:size(a))*${cnjg(t1,'b(size(a)-rr+1:size(a))')}$ , pbatch(1:rr) , cbatch(1:rr) )      
 
     p = zero_${s1}$
     do i = 1,chunk
-        call vkahan( pbatch(i) , p , cbatch(i) )
+        call kahan_kernel( pbatch(i) , p , cbatch(i) )
     end do      
 end function
 #:endfor
diff --git a/src/stdlib_intrinsics_sum.fypp b/src/stdlib_intrinsics_sum.fypp
index 7c6cbc522..5fb012384 100644
--- a/src/stdlib_intrinsics_sum.fypp
+++ b/src/stdlib_intrinsics_sum.fypp
@@ -2,7 +2,7 @@
 #:set R_KINDS_TYPES = list(zip(REAL_KINDS, REAL_TYPES, REAL_SUFFIX))
 #:set C_KINDS_TYPES = list(zip(CMPLX_KINDS, CMPLX_TYPES, CMPLX_SUFFIX))
 #:set RC_KINDS_TYPES = R_KINDS_TYPES + C_KINDS_TYPES
-#:set RANKS = range(1, MAXRANK + 1)
+#:set RANKS = range(2, MAXRANK + 1)
 
 ! This module is based on https://github.com/jalvesz/fast_math
 submodule(stdlib_intrinsics) stdlib_intrinsics_sum
@@ -146,13 +146,13 @@ pure module function fsum_kahan_1d_${rs}$(a) result(s)
     sbatch = zero_${rs}$
     cbatch = zero_${rs}$
     do i = 1, dr
-        call vkahan( a(chunk*i-chunk+1:chunk*i) , sbatch(1:chunk) , cbatch(1:chunk) )
+        call kahan_kernel( a(chunk*i-chunk+1:chunk*i) , sbatch(1:chunk) , cbatch(1:chunk) )
     end do
-    call vkahan( a(size(a)-rr+1:size(a)) , sbatch(1:rr) , cbatch(1:rr) )      
+    call kahan_kernel( a(size(a)-rr+1:size(a)) , sbatch(1:rr) , cbatch(1:rr) )      
 
     s = zero_${rs}$
     do i = 1,chunk
-        call vkahan( sbatch(i) , s , cbatch(i) )
+        call kahan_kernel( sbatch(i) , s , cbatch(i) )
     end do
 end function
 
@@ -169,13 +169,13 @@ pure module function fsum_kahan_1d_${rs}$_mask(a,mask) result(s)
     sbatch = zero_${rs}$
     cbatch = zero_${rs}$
     do i = 1, dr
-        call vkahan( a(chunk*i-chunk+1:chunk*i) , sbatch(1:chunk) , cbatch(1:chunk) , mask(chunk*i-chunk+1:chunk*i) )
+        call kahan_kernel( a(chunk*i-chunk+1:chunk*i) , sbatch(1:chunk) , cbatch(1:chunk) , mask(chunk*i-chunk+1:chunk*i) )
     end do
-    call vkahan( a(size(a)-rr+1:size(a)) , sbatch(1:rr) , cbatch(1:rr) , mask(size(a)-rr+1:size(a)) )
+    call kahan_kernel( a(size(a)-rr+1:size(a)) , sbatch(1:rr) , cbatch(1:rr) , mask(size(a)-rr+1:size(a)) )
 
     s = zero_${rs}$
     do i = 1,chunk
-        call vkahan( sbatch(i) , s , cbatch(i) )
+        call kahan_kernel( sbatch(i) , s , cbatch(i) )
     end do
 end function
 #:endfor

From ecb705070fbef96fd3cce20312b6743fe29f39d3 Mon Sep 17 00:00:00 2001
From: jalvesz <jose.alves@transvalor.com>
Date: Tue, 7 Jan 2025 20:06:09 +0100
Subject: [PATCH 15/18] small reorganization

---
 src/stdlib_intrinsics_sum.fypp       | 96 ++++++++++++++--------------
 test/intrinsics/test_intrinsics.fypp |  3 +-
 2 files changed, 51 insertions(+), 48 deletions(-)

diff --git a/src/stdlib_intrinsics_sum.fypp b/src/stdlib_intrinsics_sum.fypp
index 5fb012384..65ab2f574 100644
--- a/src/stdlib_intrinsics_sum.fypp
+++ b/src/stdlib_intrinsics_sum.fypp
@@ -16,6 +16,7 @@ submodule(stdlib_intrinsics) stdlib_intrinsics_sum
     
 contains
 
+!================= 1D Base implementations ============
 #:for rk, rt, rs in RC_KINDS_TYPES
 pure module function fsum_1d_${rs}$(a) result(s)
     ${rt}$, intent(in) :: a(:)
@@ -60,6 +61,54 @@ pure module function fsum_1d_${rs}$_mask(a,mask) result(s)
     end do
 end function
 
+pure module function fsum_kahan_1d_${rs}$(a) result(s)
+    ${rt}$, intent(in) :: a(:)
+    ${rt}$ :: s
+    ${rt}$ :: sbatch(chunk)
+    ${rt}$ :: cbatch(chunk)
+    integer :: i, dr, rr
+    ! -----------------------------
+    dr = size(a)/(chunk)
+    rr = size(a) - dr*chunk     
+    sbatch = zero_${rs}$
+    cbatch = zero_${rs}$
+    do i = 1, dr
+        call kahan_kernel( a(chunk*i-chunk+1:chunk*i) , sbatch(1:chunk) , cbatch(1:chunk) )
+    end do
+    call kahan_kernel( a(size(a)-rr+1:size(a)) , sbatch(1:rr) , cbatch(1:rr) )      
+
+    s = zero_${rs}$
+    do i = 1,chunk
+        call kahan_kernel( sbatch(i) , s , cbatch(i) )
+    end do
+end function
+
+pure module function fsum_kahan_1d_${rs}$_mask(a,mask) result(s)
+    ${rt}$, intent(in) :: a(:)
+    logical, intent(in) :: mask(:)
+    ${rt}$ :: s
+    ${rt}$ :: sbatch(chunk)
+    ${rt}$ :: cbatch(chunk)
+    integer :: i, dr, rr
+    ! -----------------------------
+    dr = size(a)/(chunk)
+    rr = size(a) - dr*chunk     
+    sbatch = zero_${rs}$
+    cbatch = zero_${rs}$
+    do i = 1, dr
+        call kahan_kernel( a(chunk*i-chunk+1:chunk*i) , sbatch(1:chunk) , cbatch(1:chunk) , mask(chunk*i-chunk+1:chunk*i) )
+    end do
+    call kahan_kernel( a(size(a)-rr+1:size(a)) , sbatch(1:rr) , cbatch(1:rr) , mask(size(a)-rr+1:size(a)) )
+
+    s = zero_${rs}$
+    do i = 1,chunk
+        call kahan_kernel( sbatch(i) , s , cbatch(i) )
+    end do
+end function
+#:endfor
+
+!================= N-D implementations ============
+#:for rk, rt, rs in RC_KINDS_TYPES
 #:for rank in RANKS
 pure module function fsum_${rank}$d_${rs}$( x , mask ) result( s )
     ${rt}$, intent(in) :: x${ranksuffix(rank)}$
@@ -133,51 +182,4 @@ end function
 #:endfor
 #:endfor
 
-#:for rk, rt, rs in RC_KINDS_TYPES
-pure module function fsum_kahan_1d_${rs}$(a) result(s)
-    ${rt}$, intent(in) :: a(:)
-    ${rt}$ :: s
-    ${rt}$ :: sbatch(chunk)
-    ${rt}$ :: cbatch(chunk)
-    integer :: i, dr, rr
-    ! -----------------------------
-    dr = size(a)/(chunk)
-    rr = size(a) - dr*chunk     
-    sbatch = zero_${rs}$
-    cbatch = zero_${rs}$
-    do i = 1, dr
-        call kahan_kernel( a(chunk*i-chunk+1:chunk*i) , sbatch(1:chunk) , cbatch(1:chunk) )
-    end do
-    call kahan_kernel( a(size(a)-rr+1:size(a)) , sbatch(1:rr) , cbatch(1:rr) )      
-
-    s = zero_${rs}$
-    do i = 1,chunk
-        call kahan_kernel( sbatch(i) , s , cbatch(i) )
-    end do
-end function
-
-pure module function fsum_kahan_1d_${rs}$_mask(a,mask) result(s)
-    ${rt}$, intent(in) :: a(:)
-    logical, intent(in) :: mask(:)
-    ${rt}$ :: s
-    ${rt}$ :: sbatch(chunk)
-    ${rt}$ :: cbatch(chunk)
-    integer :: i, dr, rr
-    ! -----------------------------
-    dr = size(a)/(chunk)
-    rr = size(a) - dr*chunk     
-    sbatch = zero_${rs}$
-    cbatch = zero_${rs}$
-    do i = 1, dr
-        call kahan_kernel( a(chunk*i-chunk+1:chunk*i) , sbatch(1:chunk) , cbatch(1:chunk) , mask(chunk*i-chunk+1:chunk*i) )
-    end do
-    call kahan_kernel( a(size(a)-rr+1:size(a)) , sbatch(1:rr) , cbatch(1:rr) , mask(size(a)-rr+1:size(a)) )
-
-    s = zero_${rs}$
-    do i = 1,chunk
-        call kahan_kernel( sbatch(i) , s , cbatch(i) )
-    end do
-end function
-#:endfor
-
 end submodule stdlib_intrinsics_sum
diff --git a/test/intrinsics/test_intrinsics.fypp b/test/intrinsics/test_intrinsics.fypp
index ce8ec5c0b..19b6e4358 100644
--- a/test/intrinsics/test_intrinsics.fypp
+++ b/test/intrinsics/test_intrinsics.fypp
@@ -129,7 +129,8 @@ subroutine test_sum(error)
 
         !> sum over specific rank dim
         do i = 1, rank(x)
-            call check(error, norm2( sum(x,dim=i) - fsum(x,dim=i) )<tolerance*size(x) , "KO: ndarray fsum over dim "//to_string(i) )
+            call check(error, norm2( sum(x,dim=i) - fsum(x,dim=i) )<tolerance*size(x) ,&
+                        "KO: ndarray fsum over dim "//to_string(i) )
             if (allocated(error)) return
         end do
     end block ndarray

From 14be9740067666a0aba2a0a089005490dc8cfb80 Mon Sep 17 00:00:00 2001
From: jalvesz <jose.alves@transvalor.com>
Date: Sat, 11 Jan 2025 23:53:11 +0100
Subject: [PATCH 16/18] change names to stdlib_*

---
 doc/specs/stdlib_intrinsics.md             | 26 +++++++++---------
 example/intrinsics/example_dot_product.f90 |  6 ++--
 example/intrinsics/example_sum.f90         |  6 ++--
 src/stdlib_intrinsics.fypp                 | 32 +++++++++++-----------
 src/stdlib_intrinsics_dot_product.fypp     |  4 +--
 src/stdlib_intrinsics_sum.fypp             | 32 +++++++++++-----------
 test/intrinsics/test_intrinsics.fypp       | 30 ++++++++++----------
 7 files changed, 68 insertions(+), 68 deletions(-)

diff --git a/doc/specs/stdlib_intrinsics.md b/doc/specs/stdlib_intrinsics.md
index c9baf619c..023775f02 100644
--- a/doc/specs/stdlib_intrinsics.md
+++ b/doc/specs/stdlib_intrinsics.md
@@ -11,17 +11,17 @@ title: intrinsics
 The `stdlib_intrinsics` module provides replacements for some of the well known intrinsic functions found in Fortran compilers for which either a faster and/or more accurate implementation is found which has also proven of interest to the Fortran community.
 
 <!-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -->
-### `fsum` function
+### `stdlib_sum` function
 
 #### Description
 
-The `fsum` function can replace the intrinsic `sum` for `real` or `complex` arrays. It follows a chunked implementation which maximizes vectorization potential as well as reducing the round-off error. This procedure is recommended when summing large arrays, for repetitive summation of smaller arrays consider the classical `sum`.
+The `stdlib_sum` function can replace the intrinsic `sum` for `real` or `complex` arrays. It follows a chunked implementation which maximizes vectorization potential as well as reducing the round-off error. This procedure is recommended when summing large arrays, for repetitive summation of smaller arrays consider the classical `sum`.
 
 #### Syntax
 
-`res = ` [[stdlib_intrinsics(module):fsum(interface)]] ` (x [,mask] )`
+`res = ` [[stdlib_intrinsics(module):stdlib_sum(interface)]] ` (x [,mask] )`
 
-`res = ` [[stdlib_intrinsics(module):fsum(interface)]] ` (x, dim [,mask] )`
+`res = ` [[stdlib_intrinsics(module):stdlib_sum(interface)]] ` (x, dim [,mask] )`
 
 #### Status
 
@@ -44,11 +44,11 @@ Pure function.
 If `dim` is absent, the output is a scalar of the same `type` and `kind` as to that of `x`. Otherwise, an array of rank n-1, where n equals the rank of `x`, and a shape similar to that of `x` with dimension `dim` dropped is returned.
 
 <!-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -->
-### `fsum_kahan` function
+### `stdlib_sum_kahan` function
 
 #### Description
 
-The `fsum_kahan` function can replace the intrinsic `sum` for 1D `real` or `complex` arrays. It follows a chunked implementation which maximizes vectorization potential, complemented by an `elemental` kernel based on the [kahan summation](https://en.wikipedia.org/wiki/Kahan_summation_algorithm) strategy to reduce the round-off error:
+The `stdlib_sum_kahan` function can replace the intrinsic `sum` for 1D `real` or `complex` arrays. It follows a chunked implementation which maximizes vectorization potential, complemented by an `elemental` kernel based on the [kahan summation](https://en.wikipedia.org/wiki/Kahan_summation_algorithm) strategy to reduce the round-off error:
 
 ```fortran
 elemental subroutine kahan_kernel_<kind>(a,s,c)
@@ -65,7 +65,7 @@ end subroutine
 
 #### Syntax
 
-`res = ` [[stdlib_intrinsics(module):fsum_kahan(interface)]] ` (x [,mask] )`
+`res = ` [[stdlib_intrinsics(module):stdlib_sum_kahan(interface)]] ` (x [,mask] )`
 
 #### Status
 
@@ -92,15 +92,15 @@ The output is a scalar of `type` and `kind` same as to that of `x`.
 ```
 
 <!-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -->
-### `fprod` function
+### `stdlib_dot_product` function
 
 #### Description
 
-The `fprod` function can replace the intrinsic `dot_product` for 1D `real` or `complex` arrays. It follows a chunked implementation which maximizes vectorization potential as well as reducing the round-off error. This procedure is recommended when crunching large arrays, for repetitive products of smaller arrays consider the classical `dot_product`.
+The `stdlib_dot_product` function can replace the intrinsic `dot_product` for 1D `real` or `complex` arrays. It follows a chunked implementation which maximizes vectorization potential as well as reducing the round-off error. This procedure is recommended when crunching large arrays, for repetitive products of smaller arrays consider the classical `dot_product`.
 
 #### Syntax
 
-`res = ` [[stdlib_intrinsics(module):fprod(interface)]] ` (x, y)`
+`res = ` [[stdlib_intrinsics(module):stdlib_dot_product(interface)]] ` (x, y)`
 
 #### Status
 
@@ -121,15 +121,15 @@ Pure function.
 The output is a scalar of `type` and `kind` same as to that of `x` and `y`.
 
 <!-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -->
-### `fprod_kahan` function
+### `stdlib_dot_product_kahan` function
 
 #### Description
 
-The `fprod_kahan` function can replace the intrinsic `dot_product` for 1D `real` or `complex` arrays. It follows a chunked implementation which maximizes vectorization potential , complemented by the same `elemental` kernel based on the [kahan summation](https://en.wikipedia.org/wiki/Kahan_summation_algorithm) used for `fsum` to reduce the round-off error.
+The `stdlib_dot_product_kahan` function can replace the intrinsic `dot_product` for 1D `real` or `complex` arrays. It follows a chunked implementation which maximizes vectorization potential , complemented by the same `elemental` kernel based on the [kahan summation](https://en.wikipedia.org/wiki/Kahan_summation_algorithm) used for `stdlib_sum` to reduce the round-off error.
 
 #### Syntax
 
-`res = ` [[stdlib_intrinsics(module):fprod_kahan(interface)]] ` (x, y)`
+`res = ` [[stdlib_intrinsics(module):stdlib_dot_product_kahan(interface)]] ` (x, y)`
 
 #### Status
 
diff --git a/example/intrinsics/example_dot_product.f90 b/example/intrinsics/example_dot_product.f90
index c0d111d83..fb6e40610 100644
--- a/example/intrinsics/example_dot_product.f90
+++ b/example/intrinsics/example_dot_product.f90
@@ -1,6 +1,6 @@
 program example_dot_product
     use stdlib_kinds, only: sp
-    use stdlib_intrinsics, only: fprod, fprod_kahan
+    use stdlib_intrinsics, only: stdlib_dot_product, stdlib_dot_product_kahan
     implicit none
 
     real(sp), allocatable :: x(:), y(:)
@@ -11,8 +11,8 @@ program example_dot_product
     call random_number(y)
 
     total_prod(1) = dot_product(x,y) !> compiler intrinsic
-    total_prod(2) = fprod(x,y)       !> chunked summation over inner product
-    total_prod(3) = fprod_kahan(x,y) !> chunked kahan summation over inner product
+    total_prod(2) = stdlib_dot_product(x,y)       !> chunked summation over inner product
+    total_prod(3) = stdlib_dot_product_kahan(x,y) !> chunked kahan summation over inner product
     print *, total_prod(1:3)
     
 end program example_dot_product
\ No newline at end of file
diff --git a/example/intrinsics/example_sum.f90 b/example/intrinsics/example_sum.f90
index 70c59d954..0aec4835c 100644
--- a/example/intrinsics/example_sum.f90
+++ b/example/intrinsics/example_sum.f90
@@ -1,6 +1,6 @@
 program example_sum
     use stdlib_kinds, only: sp
-    use stdlib_intrinsics, only: fsum, fsum_kahan
+    use stdlib_intrinsics, only: stdlib_sum, stdlib_sum_kahan
     implicit none
 
     real(sp), allocatable :: x(:)
@@ -10,8 +10,8 @@ program example_sum
     call random_number(x)
 
     total_sum(1) = sum(x)       !> compiler intrinsic
-    total_sum(2) = fsum(x)      !> chunked summation
-    total_sum(3) = fsum_kahan(x)!> chunked kahan summation
+    total_sum(2) = stdlib_sum(x)      !> chunked summation
+    total_sum(3) = stdlib_sum_kahan(x)!> chunked kahan summation
     print *, total_sum(1:3)
     
 end program example_sum
\ No newline at end of file
diff --git a/src/stdlib_intrinsics.fypp b/src/stdlib_intrinsics.fypp
index f8da7a876..bf20c0fa3 100644
--- a/src/stdlib_intrinsics.fypp
+++ b/src/stdlib_intrinsics.fypp
@@ -12,24 +12,24 @@ module stdlib_intrinsics
     implicit none
     private
 
-    interface fsum 
+    interface stdlib_sum 
         #:for rk, rt, rs in RC_KINDS_TYPES
-        pure module function fsum_1d_${rs}$(a) result(s)
+        pure module function stdlib_sum_1d_${rs}$(a) result(s)
             ${rt}$, intent(in) :: a(:)
             ${rt}$ :: s
         end function
-        pure module function fsum_1d_${rs}$_mask(a,mask) result(s)
+        pure module function stdlib_sum_1d_${rs}$_mask(a,mask) result(s)
             ${rt}$, intent(in) :: a(:)
             logical, intent(in) :: mask(:)
             ${rt}$ :: s
         end function
         #:for rank in RANKS
-        pure module function fsum_${rank}$d_${rs}$( x, mask ) result( s )
+        pure module function stdlib_sum_${rank}$d_${rs}$( x, mask ) result( s )
             ${rt}$, intent(in) :: x${ranksuffix(rank)}$
             logical, intent(in), optional :: mask${ranksuffix(rank)}$
             ${rt}$ :: s
         end function
-        pure module function fsum_${rank}$d_dim_${rs}$( x , dim, mask ) result( s )
+        pure module function stdlib_sum_${rank}$d_dim_${rs}$( x , dim, mask ) result( s )
             ${rt}$, intent(in) :: x${ranksuffix(rank)}$
             integer, intent(in):: dim
             logical, intent(in), optional :: mask${ranksuffix(rank)}$
@@ -38,44 +38,44 @@ module stdlib_intrinsics
         #:endfor
         #:endfor
     end interface
-    public :: fsum
+    public :: stdlib_sum
 
-    interface fsum_kahan
+    interface stdlib_sum_kahan
         #:for rk, rt, rs in RC_KINDS_TYPES
-        pure module function fsum_kahan_1d_${rs}$(a) result(s)
+        pure module function stdlib_sum_kahan_1d_${rs}$(a) result(s)
             ${rt}$, intent(in) :: a(:)
             ${rt}$ :: s
         end function
-        pure module function fsum_kahan_1d_${rs}$_mask(a,mask) result(s)
+        pure module function stdlib_sum_kahan_1d_${rs}$_mask(a,mask) result(s)
             ${rt}$, intent(in) :: a(:)
             logical, intent(in) :: mask(:)
             ${rt}$ :: s
         end function
         #:endfor
     end interface
-    public :: fsum_kahan
+    public :: stdlib_sum_kahan
 
-    interface fprod
+    interface stdlib_dot_product
         #:for rk, rt, rs in RC_KINDS_TYPES
-        pure module function fprod_${rs}$(a,b) result(p)
+        pure module function stdlib_dot_product_${rs}$(a,b) result(p)
             ${rt}$, intent(in) :: a(:)
             ${rt}$, intent(in) :: b(:)
             ${rt}$ :: p
         end function
         #:endfor
     end interface
-    public :: fprod
+    public :: stdlib_dot_product
 
-    interface fprod_kahan
+    interface stdlib_dot_product_kahan
         #:for rk, rt, rs in RC_KINDS_TYPES
-        pure module function fprod_kahan_${rs}$(a,b) result(p)
+        pure module function stdlib_dot_product_kahan_${rs}$(a,b) result(p)
             ${rt}$, intent(in) :: a(:)
             ${rt}$, intent(in) :: b(:)
             ${rt}$ :: p
         end function
         #:endfor
     end interface
-    public :: fprod_kahan
+    public :: stdlib_dot_product_kahan
 
     interface kahan_kernel 
         #:for rk, rt, rs in RC_KINDS_TYPES
diff --git a/src/stdlib_intrinsics_dot_product.fypp b/src/stdlib_intrinsics_dot_product.fypp
index 73e4fffcc..551756a9c 100644
--- a/src/stdlib_intrinsics_dot_product.fypp
+++ b/src/stdlib_intrinsics_dot_product.fypp
@@ -24,7 +24,7 @@ submodule(stdlib_intrinsics) stdlib_intrinsics_dot_product
 contains
 
 #:for k1, t1, s1 in RC_KINDS_TYPES
-pure module function fprod_${s1}$(a,b) result(p)
+pure module function stdlib_dot_product_${s1}$(a,b) result(p)
     ${t1}$, intent(in) :: a(:)
     ${t1}$, intent(in) :: b(:)
     ${t1}$ :: p
@@ -48,7 +48,7 @@ end function
 #:endfor
 
 #:for k1, t1, s1 in RC_KINDS_TYPES
-pure module function fprod_kahan_${s1}$(a,b) result(p)
+pure module function stdlib_dot_product_kahan_${s1}$(a,b) result(p)
     ${t1}$, intent(in) :: a(:)
     ${t1}$, intent(in) :: b(:)
     ${t1}$ :: p
diff --git a/src/stdlib_intrinsics_sum.fypp b/src/stdlib_intrinsics_sum.fypp
index 65ab2f574..f6b771cf9 100644
--- a/src/stdlib_intrinsics_sum.fypp
+++ b/src/stdlib_intrinsics_sum.fypp
@@ -18,7 +18,7 @@ contains
 
 !================= 1D Base implementations ============
 #:for rk, rt, rs in RC_KINDS_TYPES
-pure module function fsum_1d_${rs}$(a) result(s)
+pure module function stdlib_sum_1d_${rs}$(a) result(s)
     ${rt}$, intent(in) :: a(:)
     ${rt}$ :: s
     ${rt}$ :: abatch(chunk)
@@ -39,7 +39,7 @@ pure module function fsum_1d_${rs}$(a) result(s)
     end do
 end function
 
-pure module function fsum_1d_${rs}$_mask(a,mask) result(s)
+pure module function stdlib_sum_1d_${rs}$_mask(a,mask) result(s)
     ${rt}$, intent(in) :: a(:)
     logical, intent(in) :: mask(:)
     ${rt}$ :: s
@@ -61,7 +61,7 @@ pure module function fsum_1d_${rs}$_mask(a,mask) result(s)
     end do
 end function
 
-pure module function fsum_kahan_1d_${rs}$(a) result(s)
+pure module function stdlib_sum_kahan_1d_${rs}$(a) result(s)
     ${rt}$, intent(in) :: a(:)
     ${rt}$ :: s
     ${rt}$ :: sbatch(chunk)
@@ -83,7 +83,7 @@ pure module function fsum_kahan_1d_${rs}$(a) result(s)
     end do
 end function
 
-pure module function fsum_kahan_1d_${rs}$_mask(a,mask) result(s)
+pure module function stdlib_sum_kahan_1d_${rs}$_mask(a,mask) result(s)
     ${rt}$, intent(in) :: a(:)
     logical, intent(in) :: mask(:)
     ${rt}$ :: s
@@ -110,7 +110,7 @@ end function
 !================= N-D implementations ============
 #:for rk, rt, rs in RC_KINDS_TYPES
 #:for rank in RANKS
-pure module function fsum_${rank}$d_${rs}$( x , mask ) result( s )
+pure module function stdlib_sum_${rank}$d_${rs}$( x , mask ) result( s )
     ${rt}$, intent(in) :: x${ranksuffix(rank)}$
     logical, intent(in), optional :: mask${ranksuffix(rank)}$
     ${rt}$ :: s
@@ -123,17 +123,17 @@ contains
     pure ${rt}$ function sum_recast(b,n)
       integer, intent(in) :: n
       ${rt}$, intent(in) :: b(n)
-      sum_recast = fsum(b)
+      sum_recast = stdlib_sum(b)
     end function
     pure ${rt}$ function sum_recast_mask(b,m,n)
       integer, intent(in) :: n
       ${rt}$, intent(in) :: b(n)
       logical, intent(in) :: m(n)
-      sum_recast_mask = fsum(b,m)
+      sum_recast_mask = stdlib_sum(b,m)
     end function
 end function
 
-pure module function fsum_${rank}$d_dim_${rs}$( x , dim, mask ) result( s )
+pure module function stdlib_sum_${rank}$d_dim_${rs}$( x , dim, mask ) result( s )
     ${rt}$, intent(in) :: x${ranksuffix(rank)}$
     integer, intent(in):: dim
     logical, intent(in), optional :: mask${ranksuffix(rank)}$
@@ -144,17 +144,17 @@ pure module function fsum_${rank}$d_dim_${rs}$( x , dim, mask ) result( s )
         if(dim<${rank}$)then
             do j = 1, size(x,dim=${rank}$)
                 #:if rank == 2
-                s${select_subarray(rank-1, [(rank-1, 'j')])}$ = fsum( x${select_subarray(rank, [(rank, 'j')])}$ )
+                s${select_subarray(rank-1, [(rank-1, 'j')])}$ = stdlib_sum( x${select_subarray(rank, [(rank, 'j')])}$ )
                 #:else
-                s${select_subarray(rank-1, [(rank-1, 'j')])}$ = fsum( x${select_subarray(rank, [(rank, 'j')])}$, dim=dim )
+                s${select_subarray(rank-1, [(rank-1, 'j')])}$ = stdlib_sum( x${select_subarray(rank, [(rank, 'j')])}$, dim=dim )
                 #:endif
             end do
         else
             do j = 1, size(x,dim=1)
                 #:if rank == 2
-                s${select_subarray(rank-1, [(1, 'j')])}$ = fsum( x${select_subarray(rank, [(1, 'j')])}$ )
+                s${select_subarray(rank-1, [(1, 'j')])}$ = stdlib_sum( x${select_subarray(rank, [(1, 'j')])}$ )
                 #:else
-                s${select_subarray(rank-1, [(1, 'j')])}$ = fsum( x${select_subarray(rank, [(1, 'j')])}$, dim=${rank-1}$ )
+                s${select_subarray(rank-1, [(1, 'j')])}$ = stdlib_sum( x${select_subarray(rank, [(1, 'j')])}$, dim=${rank-1}$ )
                 #:endif
             end do
         end if
@@ -162,17 +162,17 @@ pure module function fsum_${rank}$d_dim_${rs}$( x , dim, mask ) result( s )
         if(dim<${rank}$)then
             do j = 1, size(x,dim=${rank}$)
                 #:if rank == 2
-                s${select_subarray(rank-1, [(rank-1, 'j')])}$ = fsum( x${select_subarray(rank, [(rank, 'j')])}$, mask=mask${select_subarray(rank, [(rank, 'j')])}$ )
+                s${select_subarray(rank-1, [(rank-1, 'j')])}$ = stdlib_sum( x${select_subarray(rank, [(rank, 'j')])}$, mask=mask${select_subarray(rank, [(rank, 'j')])}$ )
                 #:else
-                s${select_subarray(rank-1, [(rank-1, 'j')])}$ = fsum( x${select_subarray(rank, [(rank, 'j')])}$, dim=dim, mask=mask${select_subarray(rank, [(rank, 'j')])}$ )
+                s${select_subarray(rank-1, [(rank-1, 'j')])}$ = stdlib_sum( x${select_subarray(rank, [(rank, 'j')])}$, dim=dim, mask=mask${select_subarray(rank, [(rank, 'j')])}$ )
                 #:endif
             end do
         else
             do j = 1, size(x,dim=1)
                 #:if rank == 2
-                s${select_subarray(rank-1, [(1, 'j')])}$ = fsum( x${select_subarray(rank, [(1, 'j')])}$, mask=mask${select_subarray(rank, [(1, 'j')])}$ )
+                s${select_subarray(rank-1, [(1, 'j')])}$ = stdlib_sum( x${select_subarray(rank, [(1, 'j')])}$, mask=mask${select_subarray(rank, [(1, 'j')])}$ )
                 #:else
-                s${select_subarray(rank-1, [(1, 'j')])}$ = fsum( x${select_subarray(rank, [(1, 'j')])}$, dim=${rank-1}$, mask=mask${select_subarray(rank, [(1, 'j')])}$ )
+                s${select_subarray(rank-1, [(1, 'j')])}$ = stdlib_sum( x${select_subarray(rank, [(1, 'j')])}$, dim=${rank-1}$, mask=mask${select_subarray(rank, [(1, 'j')])}$ )
                 #:endif
             end do
         end if
diff --git a/test/intrinsics/test_intrinsics.fypp b/test/intrinsics/test_intrinsics.fypp
index 19b6e4358..a8a14b701 100644
--- a/test/intrinsics/test_intrinsics.fypp
+++ b/test/intrinsics/test_intrinsics.fypp
@@ -55,16 +55,16 @@ subroutine test_sum(error)
         end do
         
         xsum(1) = sum(x)        ! compiler intrinsic
-        xsum(2) = fsum_kahan(x) ! chunked Kahan summation
-        xsum(3) = fsum(x)       ! chunked summation
+        xsum(2) = stdlib_sum_kahan(x) ! chunked Kahan summation
+        xsum(3) = stdlib_sum(x)       ! chunked summation
         err(1:ncalc) = abs(1._${k1}$-xsum(1:ncalc)/total_sum)
         
         call check(error, all(err(:)<tolerance) , "real sum is not accurate" )
         if (allocated(error)) return
 
         xsum(1) = sum(x,mask)+sum(x,nmask) ! compiler intrinsic
-        xsum(2) = fsum_kahan(x,mask)+fsum_kahan(x,nmask) ! chunked Kahan summation
-        xsum(3) = fsum(x,mask)+fsum(x,nmask) ! chunked summation
+        xsum(2) = stdlib_sum_kahan(x,mask)+stdlib_sum_kahan(x,nmask) ! chunked Kahan summation
+        xsum(3) = stdlib_sum(x,mask)+stdlib_sum(x,nmask) ! chunked summation
         err(1:ncalc) = abs(1._${k1}$-xsum(1:ncalc)/total_sum)
         
         call check(error, all(err(:)<tolerance) , "masked real sum is not accurate" )
@@ -97,8 +97,8 @@ subroutine test_sum(error)
         end do
         
         xsum(1) = sum(x)        ! compiler intrinsic
-        xsum(2) = fsum_kahan(x) ! chunked Kahan summation
-        xsum(3) = fsum(x)       ! chunked summation
+        xsum(2) = stdlib_sum_kahan(x) ! chunked Kahan summation
+        xsum(3) = stdlib_sum(x)       ! chunked summation
         err(1:ncalc) = abs(1._${k1}$-(xsum(1:ncalc)%re)/total_sum)
         
         
@@ -106,8 +106,8 @@ subroutine test_sum(error)
         if (allocated(error)) return
 
         xsum(1) = sum(x,mask)+sum(x,nmask) ! compiler intrinsic
-        xsum(2) = fsum_kahan(x,mask)+fsum_kahan(x,nmask) ! chunked Kahan summation
-        xsum(3) = fsum(x,mask)+fsum(x,nmask) ! chunked summation
+        xsum(2) = stdlib_sum_kahan(x,mask)+stdlib_sum_kahan(x,nmask) ! chunked Kahan summation
+        xsum(3) = stdlib_sum(x,mask)+stdlib_sum(x,nmask) ! chunked summation
         err(1:ncalc) = abs(1._${k1}$-(xsum(1:ncalc)%re)/total_sum)
         
         call check(error, all(err(:)<tolerance) , "complex masked sum is not accurate" )
@@ -124,13 +124,13 @@ subroutine test_sum(error)
         allocate(x(100,100,10))
         call random_number(x)
         !> sum all elements
-        call check(error, abs( sum(x) - fsum(x) )<tolerance*size(x) , "KO: full ndarray fsum" )
+        call check(error, abs( sum(x) - stdlib_sum(x) )<tolerance*size(x) , "KO: full ndarray stdlib_sum" )
         if (allocated(error)) return
 
         !> sum over specific rank dim
         do i = 1, rank(x)
-            call check(error, norm2( sum(x,dim=i) - fsum(x,dim=i) )<tolerance*size(x) ,&
-                        "KO: ndarray fsum over dim "//to_string(i) )
+            call check(error, norm2( sum(x,dim=i) - stdlib_sum(x,dim=i) )<tolerance*size(x) ,&
+                        "KO: ndarray stdlib_sum over dim "//to_string(i) )
             if (allocated(error)) return
         end do
     end block ndarray
@@ -164,8 +164,8 @@ subroutine test_dot_product(error)
         end do
         
         xsum(1) = dot_product(x,x) ! compiler intrinsic
-        xsum(2) = fprod_kahan(x,x) ! chunked Kahan summation
-        xsum(3) = fprod(x,x)       ! chunked summation
+        xsum(2) = stdlib_dot_product_kahan(x,x) ! chunked Kahan summation
+        xsum(3) = stdlib_dot_product(x,x)       ! chunked summation
         err(1:ncalc) = abs(1._${k1}$-xsum(1:ncalc)/total_sum)
         
         call check(error, all(err(:)<tolerance) , "real dot_product is not accurate" )
@@ -192,8 +192,8 @@ subroutine test_dot_product(error)
         end do
         
         xsum(1) = dot_product(x,x) ! compiler intrinsic
-        xsum(2) = fprod_kahan(x,x) ! chunked Kahan dot_product
-        xsum(3) = fprod(x,x)       ! chunked dot_product
+        xsum(2) = stdlib_dot_product_kahan(x,x) ! chunked Kahan dot_product
+        xsum(3) = stdlib_dot_product(x,x)       ! chunked dot_product
         err(1:ncalc) = abs(1._${k1}$-xsum(1:ncalc)%re/(2*total_sum))
          
         call check(error, all(err(:)<tolerance) , "complex dot_product is not accurate" )

From aaa68bc2475c810c906f59e2b0254447c18126dc Mon Sep 17 00:00:00 2001
From: jalvesz <jose.alves@transvalor.com>
Date: Sun, 12 Jan 2025 11:17:29 +0100
Subject: [PATCH 17/18] add comments

---
 src/stdlib_intrinsics.fypp             | 52 +++++++++++++++++++++++++-
 src/stdlib_intrinsics_dot_product.fypp |  3 +-
 src/stdlib_intrinsics_sum.fypp         |  3 +-
 3 files changed, 52 insertions(+), 6 deletions(-)

diff --git a/src/stdlib_intrinsics.fypp b/src/stdlib_intrinsics.fypp
index bf20c0fa3..12dbac190 100644
--- a/src/stdlib_intrinsics.fypp
+++ b/src/stdlib_intrinsics.fypp
@@ -4,7 +4,6 @@
 #:set RC_KINDS_TYPES = R_KINDS_TYPES + C_KINDS_TYPES
 #:set RANKS = range(2, MAXRANK + 1)
 
-! This module is based on https://github.com/jalvesz/fast_math
 module stdlib_intrinsics
     !!Replacement for certain Fortran intrinsic functions offering either faster and/or more accurate implementations.
     !! ([Specification](../page/specs/stdlib_intrinsics.html))
@@ -12,7 +11,20 @@ module stdlib_intrinsics
     implicit none
     private
 
-    interface stdlib_sum 
+    interface stdlib_sum
+        !! version: experimental 
+        !!
+        !!### Summary 
+        !! Sum elements of rank N arrays. 
+        !! ([Specification](../page/specs/stdlib_intrinsics.html#stdlib_sum))
+        !!
+        !!### Description
+        !! 
+        !! This interface provides standard conforming call for sum of elements of any rank.
+        !! The 1-D base implementation follows a chunked approach for optimizing performance and increasing accuracy.
+        !! The `N-D` interfaces calls upon the `(N-1)-D` implementation. 
+        !! Supported data types include `real` and `complex`.
+        !!
         #:for rk, rt, rs in RC_KINDS_TYPES
         pure module function stdlib_sum_1d_${rs}$(a) result(s)
             ${rt}$, intent(in) :: a(:)
@@ -41,6 +53,18 @@ module stdlib_intrinsics
     public :: stdlib_sum
 
     interface stdlib_sum_kahan
+        !! version: experimental 
+        !!
+        !!### Summary 
+        !! Sum elements of rank 1 arrays. 
+        !! ([Specification](../page/specs/stdlib_intrinsics.html#stdlib_sum_kahan))
+        !!
+        !!### Description
+        !! 
+        !! This interface provides standard conforming call for sum of elements of rank 1.
+        !! The 1-D base implementation follows a chunked approach combined with a kahan kernel for optimizing performance and increasing accuracy.
+        !! Supported data types include `real` and `complex`.
+        !!
         #:for rk, rt, rs in RC_KINDS_TYPES
         pure module function stdlib_sum_kahan_1d_${rs}$(a) result(s)
             ${rt}$, intent(in) :: a(:)
@@ -56,6 +80,18 @@ module stdlib_intrinsics
     public :: stdlib_sum_kahan
 
     interface stdlib_dot_product
+        !! version: experimental 
+        !!
+        !!### Summary 
+        !! dot_product of rank 1 arrays. 
+        !! ([Specification](../page/specs/stdlib_intrinsics.html#stdlib_dot_product))
+        !!
+        !!### Description
+        !! 
+        !! compute the dot_product of rank 1 arrays.
+        !! The 1-D base implementation follows a chunked approach for optimizing performance and increasing accuracy.
+        !! Supported data types include `real` and `complex`.
+        !!
         #:for rk, rt, rs in RC_KINDS_TYPES
         pure module function stdlib_dot_product_${rs}$(a,b) result(p)
             ${rt}$, intent(in) :: a(:)
@@ -67,6 +103,18 @@ module stdlib_intrinsics
     public :: stdlib_dot_product
 
     interface stdlib_dot_product_kahan
+        !! version: experimental 
+        !!
+        !!### Summary 
+        !! dot_product of rank 1 arrays. 
+        !! ([Specification](../page/specs/stdlib_intrinsics.html#stdlib_dot_product_kahan))
+        !!
+        !!### Description
+        !! 
+        !! compute the dot_product of rank 1 arrays.
+        !! The implementation follows a chunked approach combined with a kahan kernel for optimizing performance and increasing accuracy.
+        !! Supported data types include `real` and `complex`.
+        !!
         #:for rk, rt, rs in RC_KINDS_TYPES
         pure module function stdlib_dot_product_kahan_${rs}$(a,b) result(p)
             ${rt}$, intent(in) :: a(:)
diff --git a/src/stdlib_intrinsics_dot_product.fypp b/src/stdlib_intrinsics_dot_product.fypp
index 551756a9c..5120c290a 100644
--- a/src/stdlib_intrinsics_dot_product.fypp
+++ b/src/stdlib_intrinsics_dot_product.fypp
@@ -11,7 +11,6 @@ ${expression}$
 #:endif
 #:enddef
 
-! This module is based on https://github.com/jalvesz/fast_math
 submodule(stdlib_intrinsics) stdlib_intrinsics_dot_product
     !!Replacement for certain Fortran intrinsic functions offering either faster and/or more accurate implementations.
     !! ([Specification](../page/specs/stdlib_intrinsics.html))
@@ -22,7 +21,7 @@ submodule(stdlib_intrinsics) stdlib_intrinsics_dot_product
     integer, parameter :: chunk = 64
     
 contains
-
+! This implementation is based on https://github.com/jalvesz/fast_math
 #:for k1, t1, s1 in RC_KINDS_TYPES
 pure module function stdlib_dot_product_${s1}$(a,b) result(p)
     ${t1}$, intent(in) :: a(:)
diff --git a/src/stdlib_intrinsics_sum.fypp b/src/stdlib_intrinsics_sum.fypp
index f6b771cf9..6d9053345 100644
--- a/src/stdlib_intrinsics_sum.fypp
+++ b/src/stdlib_intrinsics_sum.fypp
@@ -4,9 +4,7 @@
 #:set RC_KINDS_TYPES = R_KINDS_TYPES + C_KINDS_TYPES
 #:set RANKS = range(2, MAXRANK + 1)
 
-! This module is based on https://github.com/jalvesz/fast_math
 submodule(stdlib_intrinsics) stdlib_intrinsics_sum
-    !!Replacement for certain Fortran intrinsic functions offering either faster and/or more accurate implementations.
     !! ([Specification](../page/specs/stdlib_intrinsics.html))
     use stdlib_kinds
     use stdlib_constants
@@ -17,6 +15,7 @@ submodule(stdlib_intrinsics) stdlib_intrinsics_sum
 contains
 
 !================= 1D Base implementations ============
+! This implementation is based on https://github.com/jalvesz/fast_math
 #:for rk, rt, rs in RC_KINDS_TYPES
 pure module function stdlib_sum_1d_${rs}$(a) result(s)
     ${rt}$, intent(in) :: a(:)

From 6e36b6f9a79c5aa28e3064a7eb56aebe30e8b17e Mon Sep 17 00:00:00 2001
From: Jose Alves <jose.alves@transvalor.com>
Date: Fri, 17 Jan 2025 19:56:39 +0100
Subject: [PATCH 18/18] extend kahan sum for rank N arrays

---
 doc/specs/stdlib_intrinsics.md       | 10 +++-
 src/stdlib_intrinsics.fypp           | 18 +++++-
 src/stdlib_intrinsics_sum.fypp       | 84 ++++++++++++++++++++++++++--
 test/intrinsics/test_intrinsics.fypp |  7 +++
 4 files changed, 110 insertions(+), 9 deletions(-)

diff --git a/doc/specs/stdlib_intrinsics.md b/doc/specs/stdlib_intrinsics.md
index 023775f02..bba436d7b 100644
--- a/doc/specs/stdlib_intrinsics.md
+++ b/doc/specs/stdlib_intrinsics.md
@@ -48,7 +48,7 @@ If `dim` is absent, the output is a scalar of the same `type` and `kind` as to t
 
 #### Description
 
-The `stdlib_sum_kahan` function can replace the intrinsic `sum` for 1D `real` or `complex` arrays. It follows a chunked implementation which maximizes vectorization potential, complemented by an `elemental` kernel based on the [kahan summation](https://en.wikipedia.org/wiki/Kahan_summation_algorithm) strategy to reduce the round-off error:
+The `stdlib_sum_kahan` function can replace the intrinsic `sum` for `real` or `complex` arrays. It follows a chunked implementation which maximizes vectorization potential complemented by an `elemental` kernel based on the [kahan summation](https://en.wikipedia.org/wiki/Kahan_summation_algorithm) strategy to reduce the round-off error:
 
 ```fortran
 elemental subroutine kahan_kernel_<kind>(a,s,c)
@@ -67,6 +67,8 @@ end subroutine
 
 `res = ` [[stdlib_intrinsics(module):stdlib_sum_kahan(interface)]] ` (x [,mask] )`
 
+`res = ` [[stdlib_intrinsics(module):stdlib_sum_kahan(interface)]] ` (x, dim [,mask] )`
+
 #### Status
 
 Experimental
@@ -79,11 +81,13 @@ Pure function.
 
 `x`: 1D array of either `real` or `complex` type. This argument is `intent(in)`.
 
-`mask` (optional): 1D array of `logical` values. This argument is `intent(in)`.
+`dim` (optional): scalar of type `integer` with a value in the range from 1 to n, where n equals the rank of `x`.
+
+`mask` (optional): N-D array of `logical` values, with the same shape as `x`. This argument is `intent(in)`.
 
 #### Output value or Result value
 
-The output is a scalar of `type` and `kind` same as to that of `x`.
+If `dim` is absent, the output is a scalar of the same `type` and `kind` as to that of `x`. Otherwise, an array of rank n-1, where n equals the rank of `x`, and a shape similar to that of `x` with dimension `dim` dropped is returned.
 
 #### Example
 
diff --git a/src/stdlib_intrinsics.fypp b/src/stdlib_intrinsics.fypp
index 12dbac190..1910dd0ce 100644
--- a/src/stdlib_intrinsics.fypp
+++ b/src/stdlib_intrinsics.fypp
@@ -56,13 +56,14 @@ module stdlib_intrinsics
         !! version: experimental 
         !!
         !!### Summary 
-        !! Sum elements of rank 1 arrays. 
+        !! Sum elements of rank N arrays. 
         !! ([Specification](../page/specs/stdlib_intrinsics.html#stdlib_sum_kahan))
         !!
         !!### Description
         !! 
-        !! This interface provides standard conforming call for sum of elements of rank 1.
+        !! This interface provides standard conforming call for sum of elements of any rank.
         !! The 1-D base implementation follows a chunked approach combined with a kahan kernel for optimizing performance and increasing accuracy.
+        !! The `N-D` interfaces calls upon the `(N-1)-D` implementation. 
         !! Supported data types include `real` and `complex`.
         !!
         #:for rk, rt, rs in RC_KINDS_TYPES
@@ -75,6 +76,19 @@ module stdlib_intrinsics
             logical, intent(in) :: mask(:)
             ${rt}$ :: s
         end function
+        #:for rank in RANKS
+        pure module function stdlib_sum_kahan_${rank}$d_${rs}$( x, mask ) result( s )
+            ${rt}$, intent(in) :: x${ranksuffix(rank)}$
+            logical, intent(in), optional :: mask${ranksuffix(rank)}$
+            ${rt}$ :: s
+        end function
+        pure module function stdlib_sum_kahan_${rank}$d_dim_${rs}$( x , dim, mask ) result( s )
+            ${rt}$, intent(in) :: x${ranksuffix(rank)}$
+            integer, intent(in):: dim
+            logical, intent(in), optional :: mask${ranksuffix(rank)}$
+            ${rt}$ :: s${reduced_shape('x', rank, 'dim')}$
+        end function
+        #:endfor
         #:endfor
     end interface
     public :: stdlib_sum_kahan
diff --git a/src/stdlib_intrinsics_sum.fypp b/src/stdlib_intrinsics_sum.fypp
index 6d9053345..84b449608 100644
--- a/src/stdlib_intrinsics_sum.fypp
+++ b/src/stdlib_intrinsics_sum.fypp
@@ -9,8 +9,6 @@ submodule(stdlib_intrinsics) stdlib_intrinsics_sum
     use stdlib_kinds
     use stdlib_constants
     implicit none
-
-    integer, parameter :: chunk = 64
     
 contains
 
@@ -18,6 +16,7 @@ contains
 ! This implementation is based on https://github.com/jalvesz/fast_math
 #:for rk, rt, rs in RC_KINDS_TYPES
 pure module function stdlib_sum_1d_${rs}$(a) result(s)
+    integer, parameter :: chunk = 64
     ${rt}$, intent(in) :: a(:)
     ${rt}$ :: s
     ${rt}$ :: abatch(chunk)
@@ -39,6 +38,7 @@ pure module function stdlib_sum_1d_${rs}$(a) result(s)
 end function
 
 pure module function stdlib_sum_1d_${rs}$_mask(a,mask) result(s)
+    integer, parameter :: chunk = 64
     ${rt}$, intent(in) :: a(:)
     logical, intent(in) :: mask(:)
     ${rt}$ :: s
@@ -61,13 +61,14 @@ pure module function stdlib_sum_1d_${rs}$_mask(a,mask) result(s)
 end function
 
 pure module function stdlib_sum_kahan_1d_${rs}$(a) result(s)
+    integer, parameter :: chunk = 64
     ${rt}$, intent(in) :: a(:)
     ${rt}$ :: s
     ${rt}$ :: sbatch(chunk)
     ${rt}$ :: cbatch(chunk)
     integer :: i, dr, rr
     ! -----------------------------
-    dr = size(a)/(chunk)
+    dr = size(a)/chunk
     rr = size(a) - dr*chunk     
     sbatch = zero_${rs}$
     cbatch = zero_${rs}$
@@ -83,6 +84,7 @@ pure module function stdlib_sum_kahan_1d_${rs}$(a) result(s)
 end function
 
 pure module function stdlib_sum_kahan_1d_${rs}$_mask(a,mask) result(s)
+    integer, parameter :: chunk = 64
     ${rt}$, intent(in) :: a(:)
     logical, intent(in) :: mask(:)
     ${rt}$ :: s
@@ -90,7 +92,7 @@ pure module function stdlib_sum_kahan_1d_${rs}$_mask(a,mask) result(s)
     ${rt}$ :: cbatch(chunk)
     integer :: i, dr, rr
     ! -----------------------------
-    dr = size(a)/(chunk)
+    dr = size(a)/chunk
     rr = size(a) - dr*chunk     
     sbatch = zero_${rs}$
     cbatch = zero_${rs}$
@@ -181,4 +183,78 @@ end function
 #:endfor
 #:endfor
 
+#:for rk, rt, rs in RC_KINDS_TYPES
+#:for rank in RANKS
+pure module function stdlib_sum_kahan_${rank}$d_${rs}$( x , mask ) result( s )
+    ${rt}$, intent(in) :: x${ranksuffix(rank)}$
+    logical, intent(in), optional :: mask${ranksuffix(rank)}$
+    ${rt}$ :: s
+    if(.not.present(mask)) then
+        s = sum_recast(x,size(x))
+    else
+        s = sum_recast_mask(x,mask,size(x))
+    end if
+contains
+    pure ${rt}$ function sum_recast(b,n)
+      integer, intent(in) :: n
+      ${rt}$, intent(in) :: b(n)
+      sum_recast = stdlib_sum_kahan(b)
+    end function
+    pure ${rt}$ function sum_recast_mask(b,m,n)
+      integer, intent(in) :: n
+      ${rt}$, intent(in) :: b(n)
+      logical, intent(in) :: m(n)
+      sum_recast_mask = stdlib_sum_kahan(b,m)
+    end function
+end function
+
+pure module function stdlib_sum_kahan_${rank}$d_dim_${rs}$( x , dim, mask ) result( s )
+    ${rt}$, intent(in) :: x${ranksuffix(rank)}$
+    integer, intent(in):: dim
+    logical, intent(in), optional :: mask${ranksuffix(rank)}$
+    ${rt}$ :: s${reduced_shape('x', rank, 'dim')}$
+    integer :: j 
+
+    if(.not.present(mask)) then
+        if(dim<${rank}$)then
+            do j = 1, size(x,dim=${rank}$)
+                #:if rank == 2
+                s${select_subarray(rank-1, [(rank-1, 'j')])}$ = stdlib_sum_kahan( x${select_subarray(rank, [(rank, 'j')])}$ )
+                #:else
+                s${select_subarray(rank-1, [(rank-1, 'j')])}$ = stdlib_sum_kahan( x${select_subarray(rank, [(rank, 'j')])}$, dim=dim )
+                #:endif
+            end do
+        else
+            do j = 1, size(x,dim=1)
+                #:if rank == 2
+                s${select_subarray(rank-1, [(1, 'j')])}$ = stdlib_sum_kahan( x${select_subarray(rank, [(1, 'j')])}$ )
+                #:else
+                s${select_subarray(rank-1, [(1, 'j')])}$ = stdlib_sum_kahan( x${select_subarray(rank, [(1, 'j')])}$, dim=${rank-1}$ )
+                #:endif
+            end do
+        end if
+    else 
+        if(dim<${rank}$)then
+            do j = 1, size(x,dim=${rank}$)
+                #:if rank == 2
+                s${select_subarray(rank-1, [(rank-1, 'j')])}$ = stdlib_sum_kahan( x${select_subarray(rank, [(rank, 'j')])}$, mask=mask${select_subarray(rank, [(rank, 'j')])}$ )
+                #:else
+                s${select_subarray(rank-1, [(rank-1, 'j')])}$ = stdlib_sum_kahan( x${select_subarray(rank, [(rank, 'j')])}$, dim=dim, mask=mask${select_subarray(rank, [(rank, 'j')])}$ )
+                #:endif
+            end do
+        else
+            do j = 1, size(x,dim=1)
+                #:if rank == 2
+                s${select_subarray(rank-1, [(1, 'j')])}$ = stdlib_sum_kahan( x${select_subarray(rank, [(1, 'j')])}$, mask=mask${select_subarray(rank, [(1, 'j')])}$ )
+                #:else
+                s${select_subarray(rank-1, [(1, 'j')])}$ = stdlib_sum_kahan( x${select_subarray(rank, [(1, 'j')])}$, dim=${rank-1}$, mask=mask${select_subarray(rank, [(1, 'j')])}$ )
+                #:endif
+            end do
+        end if
+    end if
+
+end function
+#:endfor
+#:endfor
+
 end submodule stdlib_intrinsics_sum
diff --git a/test/intrinsics/test_intrinsics.fypp b/test/intrinsics/test_intrinsics.fypp
index a8a14b701..7ef7eddc4 100644
--- a/test/intrinsics/test_intrinsics.fypp
+++ b/test/intrinsics/test_intrinsics.fypp
@@ -127,11 +127,18 @@ subroutine test_sum(error)
         call check(error, abs( sum(x) - stdlib_sum(x) )<tolerance*size(x) , "KO: full ndarray stdlib_sum" )
         if (allocated(error)) return
 
+        call check(error, abs( sum(x) - stdlib_sum_kahan(x) )<tolerance*size(x) , "KO: full ndarray stdlib_sum_kahan" )
+        if (allocated(error)) return
+
         !> sum over specific rank dim
         do i = 1, rank(x)
             call check(error, norm2( sum(x,dim=i) - stdlib_sum(x,dim=i) )<tolerance*size(x) ,&
                         "KO: ndarray stdlib_sum over dim "//to_string(i) )
             if (allocated(error)) return
+
+            call check(error, norm2( sum(x,dim=i) - stdlib_sum_kahan(x,dim=i) )<tolerance*size(x) ,&
+                        "KO: ndarray stdlib_sum_kahan over dim "//to_string(i) )
+            if (allocated(error)) return
         end do
     end block ndarray