From 904c7499193f9d894cd557f4237d9a60ad42908d Mon Sep 17 00:00:00 2001 From: Even Rouault Date: Tue, 29 Oct 2024 15:55:22 +0100 Subject: [PATCH 01/62] test_pct.py: skip on sanitize CI --- autotest/pyscripts/test_pct.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/autotest/pyscripts/test_pct.py b/autotest/pyscripts/test_pct.py index 88212f866c43..c549685ea5ff 100755 --- a/autotest/pyscripts/test_pct.py +++ b/autotest/pyscripts/test_pct.py @@ -40,6 +40,9 @@ def script_path(): def test_rgb2pct_help(script_path): + if gdaltest.is_travis_branch("sanitize"): + pytest.skip("fails on sanitize for unknown reason") + assert "ERROR" not in test_py_scripts.run_py_script( script_path, "rgb2pct", "--help" ) @@ -51,6 +54,9 @@ def test_rgb2pct_help(script_path): def test_rgb2pct_version(script_path): + if gdaltest.is_travis_branch("sanitize"): + pytest.skip("fails on sanitize for unknown reason") + assert "ERROR" not in test_py_scripts.run_py_script( script_path, "rgb2pct", "--version" ) From 456f6f7f10c6561583ff0ca07dd35a3f7a615538 Mon Sep 17 00:00:00 2001 From: Even Rouault Date: Wed, 30 Oct 2024 23:01:01 +0100 Subject: [PATCH 02/62] tests: GTiff: verify that we can generate an output that is byte-identical to the expected golden file. --- .../data/gtiff/byte_little_endian_golden.tif | Bin 0 -> 758 bytes .../byte_little_endian_tiled_lzw_golden.tif | Bin 0 -> 816 bytes .../gtiff/float32_little_endian_golden.tif | Bin 0 -> 1958 bytes .../gtiff/uint16_little_endian_golden.tif | Bin 0 -> 1158 bytes autotest/gcore/tiff_write.py | 27 ++++++++++++++++++ 5 files changed, 27 insertions(+) create mode 100644 autotest/gcore/data/gtiff/byte_little_endian_golden.tif create mode 100644 autotest/gcore/data/gtiff/byte_little_endian_tiled_lzw_golden.tif create mode 100644 autotest/gcore/data/gtiff/float32_little_endian_golden.tif create mode 100644 autotest/gcore/data/gtiff/uint16_little_endian_golden.tif diff --git a/autotest/gcore/data/gtiff/byte_little_endian_golden.tif b/autotest/gcore/data/gtiff/byte_little_endian_golden.tif new file mode 100644 index 0000000000000000000000000000000000000000..a4cbc947432c190b4c5815cbf3d54499392b807b GIT binary patch literal 758 zcmZutF>6#o5Z-rpoFN>F;2Dq<4^n9(v9V3SCc#{z5_s*R2m@M7=cLksYX-5{aV|F2|up-o@-{ zISTv0r@*(dINIQU`L^>vboA@dvDdG^zwEQ)s(-{Hcq3~d-@q>3;>M!3DGOv6%VTttjqtjdwX99#1~n+)b8-g< ka4V-UY*||vF2DcWD48T}+k~$qoUY3$=iI6PxCcl23r4iyN&o-= literal 0 HcmV?d00001 diff --git a/autotest/gcore/data/gtiff/byte_little_endian_tiled_lzw_golden.tif b/autotest/gcore/data/gtiff/byte_little_endian_tiled_lzw_golden.tif new file mode 100644 index 0000000000000000000000000000000000000000..b43311f0e16ab8e4e271c7e5518ddce13d58fd21 GIT binary patch literal 816 zcmebD)MDUZU|-qGR53%@tUxv!lnv4&io}*dV%s9IouF(1 zpn7K{whJQ*0}D{>5fHlq#X;;_Kpc#um#>+Jff*?F35c6oco^7#G#Aj~_3g|I3P83M zkiD^;i9rU)t^zp-2sSaqGQ=?GGfV^WJ(w66mI5)*NGR}gfG}V*OdLDy@Jz}PBy(n- z510nA8Ns2&&H!>Shz;}xm}X+x*e(i^=h)cJ#vsbT1`IexhCXEm9uOU#!2+f=mHixD zjLa4E6+%ON6{_;{QWXph{c6C1H4F_>=QEsV33Bu_EAkesFkHvYa~lO9n;{A`wI{4SiGmN)9tNro!TA!^>Gp;_%3RzU-TG5r*Q}9HE zr9tjo=8*+VyzDKG6FQuwiiG(YWX=|9w5UAmv0SLMp*c#>qPq9s!<@%4Z6a$MJsa3e zSN3_UImtYllpOg^T+8-c>L+0-p%7dDe5c4J_Yk?H2@OgjlJH^|~(!$_&|+YBtxlp`k(IY=(;h*x^%F97*g0 hI$Y{J$l=$D8(otFcRVSxm7m~|5@e||&4;0(0RW3a)QSKA literal 0 HcmV?d00001 diff --git a/autotest/gcore/data/gtiff/float32_little_endian_golden.tif b/autotest/gcore/data/gtiff/float32_little_endian_golden.tif new file mode 100644 index 0000000000000000000000000000000000000000..96e3dfd313396fa5df81cb66b3c816b5a04ff81f GIT binary patch literal 1958 
zcmZwHv1(Ld5C!0Wv&$M}QG`T6QY-`u1(671B_vUch;F0e6DU?%3K9@h*v`t@SCD{D zV6m0;7Ct~8A+_^u?gX$HVT7YzSiZ_st>yy zZ`{kT3`Q&Mn`&(K>Q}p*=e3TrJ8o^O$9dl6!{S`}GHK%P!soA(R@C;l@N+(F(~1|% z|7gC}wkm&OiL<}i?rEBM);KrWJ-)uD@#n|WRg@1_PjI2N(P>`mu4-$eqxq)(`q6yc zHru$aYtX)Jw~6@b?PwKmZqN1}-Mv4(Gky8u+4ST4cdw^AJF~-8@vzj({nkzjqlc{> zcTqkmhyk8FF*z;WR`oUt@xaqd?MBz*o1Xrnz)PN4qB_s`Wgk4X_=w@?Eer8c2em}| zWL~_&&$%C?rAywQNo!9rJIXU#M2N8!K#6$f-)o*v80UptZuNK{+a@y3?X9e8H z75UNIFFVrUMVGo-zvoOJUiSEZFM1p^vxk^7Pkr%TH$zXY&v!O6wd|NWy`GqwGb{^v zMt!-chpW#|3_rch(ZhBAAL{)2T=$?BA2V}b@uGwC&bc>z9`LK<@}cR&tjx#F|5lv` z;EQVVyi;--kn5W-=Ogz#d2aQSj)zMp>PM@VcOpF+HGc6I^ISB%ooVpZ0_S+B<&Mqu z{W$yPaF#vbnzN`V7XinaUNo0hp1aAe`JKYee$gz75B$X3gR>m$<^ATD_aHnz=6)~8 zc{t4fI5Qb;(jx)%i2D}K8Vj-0_5*ynDY!b{hD#EpkArOegb#5wMxMoSUMz9O{0sese1xxcjGxsH1cptMfH}lRr?>D;#2QO{I%pRL%UCR_bm1V8CvyPuA z-(APx`mJ~~>v`|73d2@@YaPS+Y>mmgX4@^y`&Rtu`@mGTU$Sr2!20rkWS7&fc>0x#F^jVfmttN$@N^Px10{CZ!D*M8(LrM z%Ispt2KY}WJ@+qmM*DAHzsX7>1~Czwigq6-w~-lL8CO>C^}LKY|8jh&z)dQi-!*fP^m zL{}{ov*9g#BK=fjo59xTf4oy>(LWX@rlT=!4D?)#yq8eJj$ Date: Wed, 30 Oct 2024 23:01:29 +0100 Subject: [PATCH 03/62] tests: COG: verify that we can generate an output that is byte-identical to the expected golden file. --- autotest/gcore/cog.py | 27 ++++++++++++++++++ ...blocksize_16_predictor_standard_golden.tif | Bin 0 -> 1624 bytes .../data/cog/byte_little_endian_golden.tif | Bin 0 -> 1797 bytes 3 files changed, 27 insertions(+) create mode 100644 autotest/gcore/data/cog/byte_little_endian_blocksize_16_predictor_standard_golden.tif create mode 100644 autotest/gcore/data/cog/byte_little_endian_golden.tif diff --git a/autotest/gcore/cog.py b/autotest/gcore/cog.py index 3ef850bb019c..0e58f0e18af7 100755 --- a/autotest/gcore/cog.py +++ b/autotest/gcore/cog.py @@ -13,6 +13,7 @@ # SPDX-License-Identifier: MIT ############################################################################### +import os import struct import sys @@ -1935,3 +1936,29 @@ def test_cog_mask_band_overviews(tmp_vsimem): assert ds.GetRasterBand(1).IsMaskBand() assert ds.GetRasterBand(1).GetOverview(0).IsMaskBand() assert ds.GetRasterBand(1).GetOverview(1).IsMaskBand() + + +############################################################################### +# Verify that we can generate an output that is byte-identical to the expected golden file. 
+ + +@pytest.mark.parametrize( + "src_filename,creation_options", + [ + ("data/cog/byte_little_endian_golden.tif", []), + ( + "data/cog/byte_little_endian_blocksize_16_predictor_standard_golden.tif", + ["BLOCKSIZE=16", "PREDICTOR=STANDARD"], + ), + ], +) +def test_cog_write_check_golden_file(tmp_path, src_filename, creation_options): + + out_filename = str(tmp_path / "test.tif") + with gdal.config_option("GDAL_TIFF_ENDIANNESS", "LITTLE"): + with gdal.Open(src_filename) as src_ds: + gdal.GetDriverByName("COG").CreateCopy( + out_filename, src_ds, options=creation_options + ) + assert os.stat(src_filename).st_size == os.stat(out_filename).st_size + assert open(src_filename, "rb").read() == open(out_filename, "rb").read() diff --git a/autotest/gcore/data/cog/byte_little_endian_blocksize_16_predictor_standard_golden.tif b/autotest/gcore/data/cog/byte_little_endian_blocksize_16_predictor_standard_golden.tif new file mode 100644 index 0000000000000000000000000000000000000000..aaade76e71f797bb209095c6d665a34ef31a12c4 GIT binary patch literal 1624 zcmb`GeN0nV6u|F$E%2-@wo^cn&LRSHq7+g-+*avp`+!P6uzgsq$iohl4TUPGAXOg% zqDE(hX_p0@COWcZ)47=_ejqJ4r=lii8*0$XX4FNojbpakh}*qgP1qm%Z!bCT{O&pD zoqNtb?`kw}f?fbXo|@D$CfZn}qKk|WbremiNt$F#8VeC93~6GDG#X`&e!qg+Z;CdNn=QY1~Oc?EidS%93EoyP1HD!8^`hx<1j`UVj*PJI96j^z=gI4Fo=S7#G{Z+ zp%|1uvWX8kP`AOwwq`yEhx|F@$cPsAnhWNp*Ub zBvn#G>m-}1tI8#5X?lCmXa|wtB}@y856Y_v@X$KcJ>vg+0xZn=Y66JS1W*m=h^Y`S zhKBJIK<*OQxeAWH2Z{ua;4Y{E79ayCmKFCNViRura83?(iREUBC+$#M7iAw=JU3={ zuJv*9ucP=F$m!kd;3Z`wf4H#Rc>2u#d%lJo((7jxu~i53t4!SwG87|rxu>*?B{H9= zZoEgIDa(KP$5w2H_YgsFU$&bni@I-$9%EZ`uu`*a3a{muWzC`^4g1IZyvBQbZFTD5 z%5x>w*)i|B{+xg{eF9|E6G8owvqM#veh$R)zdbDd{p|w5i))+(3WNFVZ=?IJRMcAJ zNlsnY_wTQ_oL_lr`SDw;H!PEXI=JT4P}Rm|=O6kvU$}3pD_)Qsv**%-b@jzPtFk(8 zJg(ied_kA_%fY7`8kYAw+IZgMtFxEnBs;G?8r`(H#3ULijW>vg0EI4+?!D$M0rEsk&tcPCD8JHD7Zu}?UVA?R~Fl~3vy@hMUNz7*Z$ zi^LTi$z6EzY^)nN>(qd-!@FP`NRO@|y4~1;z~#;<1Q6pfr<3A|hMt+*VzQ}cK0Yh{ zC9i1+1e)?!fL2zb98`vjg*)jtM1rV{#(C?XxT8V!YyP3l zaYeh|lR=#IOjfv_m3p~kk^~gkB|d%v5UedIj+bp87)h<2HQPJAeb-XIZAVLoPwqa5 z$Jl$KMe(VJ6=9W~2d9E}@X24dyX=IsC(&{47Je_jdROuDJY{;@b;2#a|FJy)rWx-{ zAFaqpHt;W?s~xW0L|ozK%IWNcRn zMpL$k_=(BZ=M_tib_8;-g7EaV)1o=;@#6c$$?tv?a@mBNeM92m?7biLr`9)Lp6ZFu zy52r3VJJQD+0{sEblvvWr1{7iaF;iP^7t z2z=U4PPTNWuy&4S(a~=@lwSTht|X#i*yp#fab8w|^KKAX?s@emUu@1fR9Hd3l^DGz z?}!opz2%vW7!n!UVX6yrV7_2;Jaaa7tg}U}TM@yM6Gbhtu1P?-IBkX_zsxv))&X7c LesrgBjZppuw${Ax literal 0 HcmV?d00001 diff --git a/autotest/gcore/data/cog/byte_little_endian_golden.tif b/autotest/gcore/data/cog/byte_little_endian_golden.tif new file mode 100644 index 0000000000000000000000000000000000000000..4ace013ced02eeffc8d19c292a19b56024362dcd GIT binary patch literal 1797 zcmZwDeNN?!*Uhu+px82>zrqPs~g0+zunkkhs zLW>Pl28=Q2LoC$sky%=jnvkgtBKa69$=4v$4NXB!$nJ=a{Lww?5QibpbX1suDiD+>6o^0^;qqYx3?VooL^-I?GaT>*02epFg}8w~ zv0UcZb=G7e?>1|9!XC4B?xD?b$Q*O#m^*906##g147reI`mP}I@T@PA06;QlFSTGk z;70U0NHp84nGbjnK1#T;$PEZ5e2s8(kt+~Fm_eNL>_e0P^?&hM<^PmC$^f1DpKJIt z(P@{N>7D}sVkW1F5oe++skw;etdpCIJOCQtK|GKPpo;|NJ7Y#RX(q-*q97FiVt8!0 z6c>cQldD#S$Hk$$XOg=C=RMzNbY%0Ye96TH5tKK+NFa|^rvVGZ%MY(%(Th`F6`vZ} z&Iv1By%9@VST1;>EKP}>-uQLq;=Sh%7A3}5f?PS3zmzVI(0T>I3nw9u}mr7yFV?>dvURd#06i%j^HW72x0#)IyT2HMFfTN}e_f60Dp?W%No zbg>3K>Z6ISC!4>>S++V_l_*jBVj{{(d&56`aJpAX3N-vca(1!e!kOFHm;uZ`>^qt{o&NRS*d4j>5tBO z@#JfNXflgNr$HaS|8V^4SAQCcUv#<2kqm95dtLCk5_lp4xpKy|vcTxt{e0P#cBU=`WzconaYIVDCE~NGb{b7za>#{wam^DcF`!XxJ%Cx zNV;hPskB?K%FO7YX_fLGy;-%cmuAV!?A6=z71jVs;Rb6tOZ#eHfIzS8D_5B|UklLI zEM)%1X1AwBL=qM zkSz$6>TL#fX2ocbPH7l5G^mVYLG5{_F@qz&_IC!g@RQ%G*xJu-F;IQottz$Y^FJ86 
zy81t=8qCLTGuls@Z&x{58t(*C+nVnf*_IRbV6^+B-Kg&W#u2Quopu--?B9(Cw@4DABB1s4nC@h)eb)n73yt|YjRCv ze}$IR-TJGh(R}+!XvazWlbZ3C@u@KHwtG{xv6lPOVZ!c->Dt`>hku8a*dG5~+h~9C zG^}G{`f2U>)YAcy%ljg#YqE^uG62onBVv0bTbXVH8sEJl)Dx7k+y`2N3PfruXyrUN zNQ%&ibkB2Tf_Z~bOrfYDfNRb48q~ynAZlmuWGc#FYf_QO5z4dXdk>MoePZeYNT&52 zf{^`Ub~I!)`3-5rABxdMu*^&wYL$N^RFa!Y82_?ykH}JQikcUG`S$g2G#4ahBaw@nTu_j=5nlkZMs~=vbCPBz#Oln_vOcq jlFnC3s9AiuHf|KUWRS2o@%v0mM>U;Q5_FZ1k8}M42>!3C literal 0 HcmV?d00001 From 3af474547dfd0b9230242a31a935e6ce11164bae Mon Sep 17 00:00:00 2001 From: Even Rouault Date: Wed, 30 Oct 2024 23:22:02 +0100 Subject: [PATCH 04/62] test: GPKG: verify that we can generate an output that is byte-identical to the expected golden file. --- autotest/ogr/data/gpkg/poly_golden.gpkg | Bin 0 -> 106496 bytes autotest/ogr/ogr_gpkg.py | 42 ++++++++++++++++++++++++ 2 files changed, 42 insertions(+) create mode 100644 autotest/ogr/data/gpkg/poly_golden.gpkg diff --git a/autotest/ogr/data/gpkg/poly_golden.gpkg b/autotest/ogr/data/gpkg/poly_golden.gpkg new file mode 100644 index 0000000000000000000000000000000000000000..ab45c9747eeedd64742493f467d223b22222e7bb GIT binary patch literal 106496 zcmeI54R9OBb$}2214)U5?3iLKMba9HLZSjGg1@3I#em2Y0h$CT5VTCo7621R@+#m! z0Y_2tBq=D$`SFbFuuYSW^D{`s&ctr&WYWaWy-%-L*cUsp7#FAuv94u&bBsB?nWoDyL_OKdQ<}B^oC~DLE#N zjSt1U$AW?3f$^jE^>yveAu%wts%Li&dO{PUv{@>qM3pHiJU*W~DV+`{r^7=jDXNCG zA)~SqcJ+7l+MVNLUNIQ(4IZ_7hJIF>MGN;-__Y!-iGdi4S`{CAmj`9kJ>}2 zsFFS@rP5M(R7y=n)nv*}t@H#vqhd%59<|4*E7bXzq!CM|Gk5uD5Q)YZqGvor-K7`$3*Gty!H~C{7b(Dw4hr`k z!FoVws@;{CP-4=wtVl63BPC~2(YaZ9ii`)x{QL>%zlq#huV`Grl3Y{0;kX5C123fl46Vr8jX#Sm* z7k3;P8Yla@HS=PUDo<(AyybZ#lc2JYNmQiCvaj2mEQSirlzpzQKAI)_+Bw>w#=l;nIjSR6Mr_Zk^PR_UjS ztdN*AB`4^b2HBh@NrkAh5@GDqbF)$^DaW+ye~azY0{y`U50i;dU#1RLmvgr>E&|;(SUA|JSbn&~hYz1dsp{Kmter2_OL^fCP{L5SM`^w7pnGE{L=EMFnVpz6RrP$PvFe3&ARNyw)hMc4YY$QpA zsL}i45@{*1r=^9Ql#_8flQfx1Drq$pl@;|?vk6mCMMVV=Z?g!p5|d7)PsC+a3P@hu zP4&998qUe5Xnvjwr{()4G8mZ9CR8v9Do>ZM90>TwLo|i_Au@M7tf~nz7W9pJf|KM9 zanj7dX-W`>)6{I5HcKSJCR$o<`-c2{4-OI(577kj4~XM{o2H*CDQen;+TmE*S#NFb z>9ITjUGjbjr=^mt(aJWh>rG9=Jey*Vn&<3Hr_y0LR_dIDtb|XMz|~4mmzG9PmD8oA zrMkr^XVgUcim~L}r1wpyr!A(d4ubiXIww6BRpn?roRX%)>CyftYEb^_w`xcyK6OHweMe+)YR$lR5CuFP|~{l`NG;#U-K$!JKG68mrLwI zC+g`rshq-Ev2i~M`TUbw5*4y8QCej3G|e?_Fm0A;bM%?bcGAO>#y~fuDCOp9>nU@T zO!$3=C+LxBH3^IKX^Lw4O0nZj${Wf|KANV&mmf>Mx|-&D`__%6f>T&TggeW|M7^)} z;=6)l2tPvI;?`&w{si|q6*t;$+^SCVU%th9vw^Uf0TUsm+EUeD0XkLa;Svn`9${c9x zOo*N>M#EYv73(ApdRXHKGw(Op4+ns3J9SiOD-nx7A*4@*0w>Vpr&U;u7?dTutYG#>i)`&H-*4A znpV;4pjz!(=uD+WKlUy;R)d2(4(PGZl~t663a0@_k22>{Ypkf|HPM+&%|sRXetI}E zBBQX%nyYByBvUbZs49E-j!Dz9BI{jn;mF6GqoMX$pUB8IOg3A$Ptc3K+t#{1F`p`4>g8UUJ_^?;?1C?Fh?uo5^P1(-FZqg$ zNX)18OTL*DU5Dw?#9j4iA*WyY6*YQC;o8r)71(ay5Zg<<`ZJFmeq&w#nN1^?$#Sx3 zLvae%JC_%yMSZLpr|in=YHdvbTV}+?v1N@+aeC8+;*@c@tT-*|qrmnm?^9e!mXmL? 
zZa+fT)Bd$`p6v#}W&5QDYPn7dG*?&DBDJfwq!wFIZX?ZAypdMiU_WuOy)^HQz_xi~ zY%eiTi$;Hgt*Aw|kxNG@2vr{Qi3+|)0!RP}Y?#2A&<$|T;Ch|4Sl_z7%)^|ykF}3- zut|Qk&d1ow3`T5xVFvxxL)_pNzMI?fd^MwpRrDgAXKt1dsp{Kmter2_OL^fCP{L5+U+$DR{3ZF_jupVTC4DZt=5ViLt{c^cU-T|-8i^I z`?#I^i<16I+XtN1g#WI|j15I-=K;E>`aZIb#Po`@(%t z&-%-J6J$5*FZ-b>7|+M_H^p>0I|{P!QIPe?)A?m;0ZM*98*c&RNDIgYw8$=y^A9uq zoveN|toNUJ)v5Dxzi>>KBd>s*|0>A&mq8ZZ19I*+Am?_0EWF6{zsGp|8_1b+Ad~Nc z9Qiuae}J_=&-CvG+5I@k`5>#m{T{GqKa;;b3HDYq{a=7TvjVGs6#PvtK#=76As+It zg6s}Y=L%j7?yUN{+m@qWhI?*!R>9^}YokTc(6^@pH+_m2!u#$l zULf*|cfg#OEymn46SU950gQ;(ZAGKV-+4aSsXyw0hUy(ugFVad*m4Hvk*WxX&uo3 zxdRN=E2&I8Am>NQ8^jx&+vbG;WE0N*7&2bC3ED>@hxB^)W+p!Ya^^KK$}rY2lDq`Q zM_9&@heYUq6SJoZ+6!Aj&KvY4CQQs8L;gDG}e?g2aUc1I?;GzjL7{1$oU^K{lA9(B0pyR#ryRBvW?7PYA=k>QasWG za+WP2!i_z8eV#>p<|XJq_ggkTVdGx_*>H})94=%F*`3!lfjavKcO=`IINzlE5z+{F0LAmK=Ub z;+HIbN#g_FfL&MyxD3vI5&}Nk4f7X%4q2FIC)K%sJqQ6c!0JCY00GBjVr5GnlOqqq z5*=YHN|FCOwwM3$_ILhzS@;_kAl*xi6{W`%BdEwn8G9Cd+4Izf!V<=gDUpXjNw3r# z#v0|H1A{Z)0t3lYYy$6uH9NurCf5q}`M-erB7r~$IKQOwOD^G- zWPZgnU>Pf7?rB)D$XPc3W9%69MK(V(FHfV(qG#;ifkQgGmKrUk4iOkL;{?GtVaTq~ zK1&?JWh@Wa=*(~c?;zfRB@X^GcGjg0t~uPnE-StbvhW2stID%umSMDU)|%(edP+`0 zxD|k3A@~)9Ut#zaD8sE#{0deSaGrW`g?!^9pAtUJGQ_1_|5r2|6B=HlPxwFrNB{{S z0VIF~kN^@u0!RP}Ac0FHut>k(RXD?KP#N3Eh3#gxeXMUY7cO_TKg_0&f)d-4j&OU< zMrX#JG{0wE)ZYDt4}Mp_{Ck33{&g+AyN+&|-pTEtv0Y8$hJkU1OW)Wp7`VN>f&8Yr zo4ZROaCZ>|?k+=?y9>eJr7(;#?qayPyBz#okPLU1q~tCNf0u>33sZ6%kGsuh+ygH_ zdJT54NH5z$y#wUT1d}^J&fN=g?jx`RCY%5{bB^s(FEg3#TN_)-{0t0W*1dZd78crR^;~r7cyvZ;r~fWsZ+cFe za&>m>eXDee{_~+U{htD1z1>2J{==!Zq(c7yfKJI``>8;G@PP!701`j~NB{{S0VIF~ zkN^@u0!RP}T(bmrRMX9ay0!KWwEO?{w(khG3$|BmzqS3!_JZxFw)3{nnx2$~x_-R(rB)Pc_<8wf0n{JymK?71|U1 zQ;s_Q^Z%RL1p0#yB!C2v01`j~NB{{S0VIF~%mm&)W~mbX@@=$8``E1Zuy|Lx^4^sX zJT<&5pBP?FPC&6x;eGMvKk}}`enQJXKD?~_nw3|2U;OQty(`DJ4KGiBet0?Y3@fkl zzWC4y?@HbN;pNy{hL?{&&C092FFyaY_r(j3(0-25^7~nNjd#Vq!@Dx~_2K22rQzj2 zeVLV8y({jmpk4n9Tdt!C;Z%?S5Z*-RjMg@W=5jl45Gh;V^UmFrSP;ARp(Pucq%!ssA267h_v5MDVI`W798?#|TC}lR9JdkCKe(v8z5Ik_ zB~534krQ6CkuhPPe_RZPuH=OGTl}h$Gikz8lxczYhNV8JYCMM8*G~L_5b;lpj4&rN{X@aPD6w;0+nEpL7nEywJ-!_HK^ z&ARK%@Yn>NjY*22in)o~$em+IcCLwI0xT?kzphtPFJAW#?U6f`)R zyRQg}q6jb&C`f|h*obF9BtCyAP$q!x^o&f1ph$X1Yd*&!f%*7Zf_#(mMr`{EE`DYRy2{t+Xy$Q^VAJ#*|9eg0<`)}6m=d3>`Xt=SM0NU*8mJhosIAXO#`nn)xyzNU+-EKX`y7v-dI8#9ln8*vwD}c zxxe4CxLZFF;0`n4xoC=>SClWk**L5592^l#78usl-l`;Hl9JR`AyGT`84LPGJ;6zG zhdAkk-Zcfs>7CZ*o*v6$Q&}@K72XJ~?9J<&psB2D06h>!9q-y3Ht*R}yXYu4JU1{K@lIbbG~pQ`p|a%dWMbUs zA0m3&RypP*30XNspQER=XIVL|zd!9bu)U`F@E(hxe;;1@M4W!TT{t?gCUrTipIjEc z0xunQaRpl!7q)4ZTb3YnxN1&pDuTsT64O~I35j=woKwm8d_vJrOQE1($ZA>Pe90M; zre#G|={NQ1%BvOWspOo*l??>^= (3, 46, 0): + assert os.stat(src_filename).st_size == os.stat(out_filename).st_size + golden_data = bytearray(open(src_filename, "rb").read()) + got_data = bytearray(open(out_filename, "rb").read()) + # Zero out the SQLite version number at bytes 96-99. Cf https://www.sqlite.org/fileformat.html + golden_data[96] = golden_data[97] = golden_data[98] = golden_data[99] = 0 + got_data[96] = got_data[97] = got_data[98] = got_data[99] = 0 + assert got_data == golden_data From 1e7220e9f808ae4d6d5d9e711adfe3d4036879fb Mon Sep 17 00:00:00 2001 From: Even Rouault Date: Wed, 30 Oct 2024 23:45:07 +0100 Subject: [PATCH 05/62] test: OpenFileGDB: verify that we can generate an output that is byte-identical to the expected golden file. 
--- .../polygon_golden.gdb/a00000001.gdbtable | Bin 0 -> 333 bytes .../polygon_golden.gdb/a00000001.gdbtablx | Bin 0 -> 4128 bytes .../polygon_golden.gdb/a00000002.gdbtable | Bin 0 -> 2067 bytes .../polygon_golden.gdb/a00000002.gdbtablx | Bin 0 -> 4128 bytes .../polygon_golden.gdb/a00000003.gdbtable | Bin 0 -> 770 bytes .../polygon_golden.gdb/a00000003.gdbtablx | Bin 0 -> 4128 bytes .../polygon_golden.gdb/a00000004.gdbtable | Bin 0 -> 4427 bytes .../polygon_golden.gdb/a00000004.gdbtablx | Bin 0 -> 4128 bytes .../polygon_golden.gdb/a00000005.gdbtable | Bin 0 -> 1717 bytes .../polygon_golden.gdb/a00000005.gdbtablx | Bin 0 -> 4128 bytes .../polygon_golden.gdb/a00000006.gdbtable | Bin 0 -> 275 bytes .../polygon_golden.gdb/a00000006.gdbtablx | Bin 0 -> 4128 bytes .../polygon_golden.gdb/a00000007.gdbtable | Bin 0 -> 2275 bytes .../polygon_golden.gdb/a00000007.gdbtablx | Bin 0 -> 4128 bytes .../polygon_golden.gdb/a00000009.gdbindexes | Bin 0 -> 116 bytes .../polygon_golden.gdb/a00000009.gdbtable | Bin 0 -> 604 bytes .../polygon_golden.gdb/a00000009.gdbtablx | Bin 0 -> 5152 bytes .../polygon_golden.gdb/a00000009.spx | Bin 0 -> 4118 bytes .../data/openfilegdb/polygon_golden.gdb/gdb | Bin 0 -> 8 bytes .../openfilegdb/polygon_golden.gdb/timestamps | 1 + autotest/ogr/ogr_openfilegdb_write.py | 30 +++++++++ ogr/ogrsf_frmts/openfilegdb/ogr_openfilegdb.h | 2 +- .../ogropenfilegdb_generate_uuid.cpp | 57 +++++++++++++----- .../ogropenfilegdbdatasource_write.cpp | 2 + 24 files changed, 75 insertions(+), 17 deletions(-) create mode 100644 autotest/ogr/data/openfilegdb/polygon_golden.gdb/a00000001.gdbtable create mode 100644 autotest/ogr/data/openfilegdb/polygon_golden.gdb/a00000001.gdbtablx create mode 100644 autotest/ogr/data/openfilegdb/polygon_golden.gdb/a00000002.gdbtable create mode 100644 autotest/ogr/data/openfilegdb/polygon_golden.gdb/a00000002.gdbtablx create mode 100644 autotest/ogr/data/openfilegdb/polygon_golden.gdb/a00000003.gdbtable create mode 100644 autotest/ogr/data/openfilegdb/polygon_golden.gdb/a00000003.gdbtablx create mode 100644 autotest/ogr/data/openfilegdb/polygon_golden.gdb/a00000004.gdbtable create mode 100644 autotest/ogr/data/openfilegdb/polygon_golden.gdb/a00000004.gdbtablx create mode 100644 autotest/ogr/data/openfilegdb/polygon_golden.gdb/a00000005.gdbtable create mode 100644 autotest/ogr/data/openfilegdb/polygon_golden.gdb/a00000005.gdbtablx create mode 100644 autotest/ogr/data/openfilegdb/polygon_golden.gdb/a00000006.gdbtable create mode 100644 autotest/ogr/data/openfilegdb/polygon_golden.gdb/a00000006.gdbtablx create mode 100644 autotest/ogr/data/openfilegdb/polygon_golden.gdb/a00000007.gdbtable create mode 100644 autotest/ogr/data/openfilegdb/polygon_golden.gdb/a00000007.gdbtablx create mode 100644 autotest/ogr/data/openfilegdb/polygon_golden.gdb/a00000009.gdbindexes create mode 100644 autotest/ogr/data/openfilegdb/polygon_golden.gdb/a00000009.gdbtable create mode 100644 autotest/ogr/data/openfilegdb/polygon_golden.gdb/a00000009.gdbtablx create mode 100644 autotest/ogr/data/openfilegdb/polygon_golden.gdb/a00000009.spx create mode 100644 autotest/ogr/data/openfilegdb/polygon_golden.gdb/gdb create mode 100644 autotest/ogr/data/openfilegdb/polygon_golden.gdb/timestamps diff --git a/autotest/ogr/data/openfilegdb/polygon_golden.gdb/a00000001.gdbtable b/autotest/ogr/data/openfilegdb/polygon_golden.gdb/a00000001.gdbtable new file mode 100644 index 0000000000000000000000000000000000000000..ca18a72cc503a9e7b56890f1a4153f320980aa8d GIT binary patch literal 333 
zcmZvWTZ+Ok7=@2DBUDiF3gd%oa0}LAQM^F)eFze3A(~?AgS(jx%obdVTW|}WWb8am z;E?cr=l@9+fX@WX2UDI^<@Ysy1Hn5ACrjJ2Z_roa%@G1M;fXaQPEeHJu2G*Mg(jH6 z;6f!)a7YL|e(ui!JMAb8zP7i#P^YOZq&5j5T1?%VFxXws+z0e5cI8YKsnoGL^vUXxc`sGM(<_(j*>{`{^c% LBV`ub&6)%+%d$S+ literal 0 HcmV?d00001 diff --git a/autotest/ogr/data/openfilegdb/polygon_golden.gdb/a00000001.gdbtablx b/autotest/ogr/data/openfilegdb/polygon_golden.gdb/a00000001.gdbtablx new file mode 100644 index 0000000000000000000000000000000000000000..d3c0d9e20c025326ddd07149457a5d60155964f4 GIT binary patch literal 4128 zcmeI#F%7^l30QLYs_8O@fkCZ2S|y zz^CvF`~kvto3u{jLM5w?eeXHvUSD4~000MMgk&_yD1Q4}v)SwU!Up*t^pQ^O=p5Vt zpaC{N03Td~Zy*6F7?T~yws+2fM;1H~k?ov}6d3cb!lWe=pBMy$_!8K}C?&I)%n3T7 zZh~Jwe*W$ev_D&YMMaK|?M+8>v2uYA@E0GB{lJ?}aya&KF9;_-93^oOUQSal50kh< zbu2vh2)Q^CzQAzt6F0%X9-WDes@X@rh`pY*9@f)a7YE1(>w_#q}oL6 zuLLM7ReOs-nE@@__td4$>UD+mF;6N@lFyZJpVfGC#lIlJ)bv3aAu{@IJu5rvnozxX z3~MBu{tI_a@a(kKX%fVuQqDbcKP{vgZqcH4S&N#jm8~lG)6@&&tn68Qt9n_Bv_>+k z>=kN(PL}igL(@i@+(#JM3NF>QiQ3r?>#OWuFS2fV8Bq=eIqy={B3!<6Eee1Yr@%>| z-=I8qdHtF}??pvDyPr{#dEtmPq|DD@7_+Ow=lfWQxY7>UGr=OQRZzXoOHW2eOj6cA b-`HOWJ-E!wADbhgHQ(Oovmy?-{)zbuC0baI literal 0 HcmV?d00001 diff --git a/autotest/ogr/data/openfilegdb/polygon_golden.gdb/a00000002.gdbtablx b/autotest/ogr/data/openfilegdb/polygon_golden.gdb/a00000002.gdbtablx new file mode 100644 index 0000000000000000000000000000000000000000..29323984e52a12627a75978f1a2aeaec6f6926ad GIT binary patch literal 4128 zcmeI$u?>Pi7>3~wPjLYzK!&iTwY0Hy09b+!!wM#hU}C`n4q#{OOsHvWX?V{a?7mB$ z;&RD<$=lrpNeOmX1Aj3>e|%0xsYJF5m(#-~uk-0xsYJF5m(#@V^2)@qd2;@rDK3`w}_; literal 0 HcmV?d00001 diff --git a/autotest/ogr/data/openfilegdb/polygon_golden.gdb/a00000003.gdbtable b/autotest/ogr/data/openfilegdb/polygon_golden.gdb/a00000003.gdbtable new file mode 100644 index 0000000000000000000000000000000000000000..29b5907954ae360324c6349dc8738bb60ca1ed8a GIT binary patch literal 770 zcmZQ(U|?VZ;y*yl3dAtL#0+5>Km}NUw7ZL=4@ez|&A`aOz{9}g$>74kz{bMF7R(UD z5W?Wf5CJ5C3OGOnn;SzSLk>eRLn=_1ftiC9StJr7!XCj8$q>qr$B@ZT0#u2vDhetR zg-4AqRE;mX8g7u;Aq@FIH>5HY0o{_vkPKvj?cj>SDdUSv=HA+U@1KJF{%0biyQ{yu zb8xg0kcba=4~{pqv@lW9ad8X@^^I0?K@tiM@NfUX+@eSDu-ip`>G=XAr9s>gO2(G&eOJC=M1fG&eCd zHnKD_HMF!eF$QV{xr?D8?Er{kXb1z-4J~?(V1CvQFrT60umd{3_wCjN-;W)z`?K=n jCEug~dl)^qJ<3oC@eq_wJ$sg*M?e2zOX>1*#_%Wr1Usua literal 0 HcmV?d00001 diff --git a/autotest/ogr/data/openfilegdb/polygon_golden.gdb/a00000003.gdbtablx b/autotest/ogr/data/openfilegdb/polygon_golden.gdb/a00000003.gdbtablx new file mode 100644 index 0000000000000000000000000000000000000000..4cf22da58b1f323ad40ede26792206c5452032a7 GIT binary patch literal 4128 zcmeI$xeWjy3*G(i{iUmfQ*(44e^(+Rz*YNVNVx;aiCMt}eT0t5&UAV7cs0RjXF Ld@a!bfUfrdb<6-+ literal 0 HcmV?d00001 diff --git a/autotest/ogr/data/openfilegdb/polygon_golden.gdb/a00000004.gdbtable b/autotest/ogr/data/openfilegdb/polygon_golden.gdb/a00000004.gdbtable new file mode 100644 index 0000000000000000000000000000000000000000..e731d3f0c724cebe0b0fbdf92b1e1bb710030c98 GIT binary patch literal 4427 zcmdT{O>7%Q6no{)W*z!6}X%baiXKZg-@0#5W zafEO#2aRkHEefT!4!}panKmKmvFq zvOBT}i;i&M=6(Uz~j4?P^hc4ImCNf=FZ+ zwKyy>KtUUV3K`bX1KXV=0kA7_4^GnI92_2}0y?z8030FmkK@|*1T=REdlsPq^S&6B z(4jUiM8~g{i#40V{rQ5!fe-Do=McDzYQ!4!_2U~L=(oFU0(i({KXT@pVj>`~?_equOs z1IMi5=#CF%Tr^?9+Q2}yaBb920qAIyzldp4L7%A)G#nk5AFvWQPa5Y)vAOBIbj3Nu zQtqsKagzbI+Mm$|w$8A2>!o;s$ro=j8D9DP9^jb&kxhg2A(|m%LkwSc?^ZYjh{a(9b9?;&$&zczdHQx zq4}zjA6)yAJX*T+*w?wsKYsTSX2A;3IX>Er^y~cb?yOT7f?I6POk}y~d{mT5{s7}>p^bx}>LA ze8Fs$D>9!D;wjEG8(8X)%|oVT2nqgdbzWwc6RK|6irylD)xC|7YpKkw6P)AR 
z@sQVCJ~}=w78P63j80v#yZIH74~ta2p!5ju5WA|jW)i(L=qvgfSQ` zG);_2o`!mp{^GrO@#SkDeDTe%e|7c`tbXkBp40De zvEP4Ne*0FmR&JDuaayh|ux_bFy9ZVRm>vN778;Xh3-uCgJ$AqO>?g1n!1az&zv###!vnG>9{ z0HErkqI& zd}@YhEYzd6cOSUzY|x5%#Bx^Ty`YcdBQi()Ncs|LzKawDwb7Ab4sS%NU_5r{lp5^31ww;(j?H_eG```JqKS zeh$a@!QvF`-2H?KclRDqa+7H1Fr^>LFtrV1r)2t%uWa@z^ApZ_2i+sKPD1P)+R02N zHTi#pcbhC5fh?zz&C_c0Tqc>yq!O9rL^hWwOe7MLFmb9>oSYDZtdNzJ{eKtf5S~tMrM_^UiqP z&^9}Uj%UtS`((Bm;o`wK0ZNHj0IS}0h4o$VTnIeJVuM<_uh^=hRq&282jrRQg@DRWu9} zud0eo|7n2Ij#p*IAg&?^;TWVn0SmpxS zWNIp#NKM6sSTdK5Co@w?OrXis>T0ass8mZhLfIrl-%wj!E+)i<)!5=frHSK`4&n?- zOr@SU0v>_9CkX+}OSydy=w9?J;rL%Jw0t5&UAV7cs0RjXF O5FkL{S%LlsbUi<)AOXt& literal 0 HcmV?d00001 diff --git a/autotest/ogr/data/openfilegdb/polygon_golden.gdb/a00000005.gdbtable b/autotest/ogr/data/openfilegdb/polygon_golden.gdb/a00000005.gdbtable new file mode 100644 index 0000000000000000000000000000000000000000..d717f9fa1a7ccca3e538f9d5fa8cf85a6a3424ce GIT binary patch literal 1717 zcmZ{k4NMbP9Ka7KEjXa^VO-p@RYz-a&LOp9%rEE%r3_jqWiDZ2UXMpTXnXD570MKY z2#havMCTT5$&eD`G9)H~qK;)QBbi3$f|%(lnaefe)5*DJ0fCNc!gr3k4gisJE zC?!gt{8%Cr#1#>MU=ZOzgqf%#oM;{*(L;d&J+z|*Lhxzmy%YT-Gg34g?~iVJw12hg z;OAQ}PlyyPXz4$fK+i$@YP3Xe@yuO@DcXI@u4Ogf9y^VT!d%)`0hu&}%DvYYoR6i3 z4|fk7XfOX}4o725kfq%W3FQcG>%J{-Sg;qi$qYZA2G)997HI$nSjeR!_GbN^zoq$S zvyZO0|MmCFQ+2o-Wu_eK5NBE0bt8F$L+A6anYr*4m4$QhITn^<0Lhtw#Re6~xO+l4 zR2KBAo4gKh>fVxFxRGRnHo#G|gWX9vC2AXBS-xI+7t0L>wyD|#!;B%LFCEv!xe#z} z21*u#$UshIyNewjXnnh{_`BeeyD|yxB6pn?Q#mbFf555Y zbFBsuZpD~sCvBsxb+02B%F=(U&IxKp6ozG8)jbh7oKWIsYGB>Jzp6FTKi(Z#sq5T3 zu@xj8a^Xs13Aq#6Q3}Yg#TV~=>V+SNHdd)VIqtaUA3ZgOTOu<}+bpza4PwQQ+0h2Z znBANqac}=&UWl`?+fb;i6OeENqH7;?2YOeW(6(Pqf6EkkJ%H26PicwP!$Aj&YQtwY zN|shDKD1qT%6RT$v1;xx?j=5@!*TG61RjJ+72iOAM$~| z8-|xH>ghOy=v|Ksj_21oQw~*Y^cfkZtAt(EI;Wzu4zcSRW_R`7{-kR6JR;W zgoEIfhY>@{YubhU{41m<=O>s)HJDM1Pl{2)mRKeG{LX|igt%e6p{s)g4A_^3SJlj^>y0;Jtt9DP9QKx~jEb_NcAh9rh8hE#@Rh7tx( z1{VefHWnt9P=-(lhg*#asG^7=lOY|5^B{s;YAkFnKrO{EEkFU55Qa*I0-zuRki+E& zlqmsgOJXPmvOsDXcov;6aUeSAG`PyKUbrJi%!=e0*JzWk8B`eVhNCnjs0ce%^> Lt-kR5ILOxkzrH=A literal 0 HcmV?d00001 diff --git a/autotest/ogr/data/openfilegdb/polygon_golden.gdb/a00000006.gdbtablx b/autotest/ogr/data/openfilegdb/polygon_golden.gdb/a00000006.gdbtablx new file mode 100644 index 0000000000000000000000000000000000000000..92567c81d8c1463bcc7e6f912a8114cd652bebaa GIT binary patch literal 4128 zcmeI$!3h8$3 literal 0 HcmV?d00001 diff --git a/autotest/ogr/data/openfilegdb/polygon_golden.gdb/a00000007.gdbtable b/autotest/ogr/data/openfilegdb/polygon_golden.gdb/a00000007.gdbtable new file mode 100644 index 0000000000000000000000000000000000000000..3e11ac8a531f4d811a970f84358edd1ddca14a9c GIT binary patch literal 2275 zcmah~4NMeA6n^M|p!^*2b5&4Q8YvC7B4BM1wQ$G53EUNWq&1{5%iZ3wa_n-u3mg#} zi~Jozp^Y@D6j}lV!fz=ttwe2VNNpsxwA9!tAk{`%T4GVt5<+8V@AiOOO>dLQ%$xb% z`@VVeW_CRQNKu&{3~m6P;HiOJI0gjhfCCjM06Zk? zB-xN{H#SB{d^NxT2}U%*AzT4yp%e-Bs8Ui{QrZf5T_i)p%62U1X4B%-U3+X4%7XY?(5L-udZRK zkAC;Pe!;;`5I-76OBpG#`!J9Kq=Lu{3|iX#df zBtCMQ;&75;F@82cun0Y-q68v0u<$HL;&p8!%2Y+Q$4|X?KKt}Vm6_$vO}>$2m7HhP zF`d!Y)|-(h9~Wd;Bh+_>aN`M?0_Sj+;1Cgkb43gx*wPh3VIstt(+TW`vWvKT7c7&% z#`VcZQbN_eN$L!#9fw1|h&-L3^Hb0TD1j{? zkM-ZF!0+cKvdqPX`JM7=F3Cq}RB!OIu!XxS*B@>T-}UEQPV-W*aAa~5&ax^>OBd26 z+E^O)dLason@pux22SBdg2eKLBZZ&~2#z6jYJw}FnS57rc8KMy1i*v{0FAsZ zU1ikp2b*Ft-XHbUKRfVY?ulh{$?5h9P5Dk;i|qHdJz`y<&bowP!qK==6(&|CxI3y4 z(96T;N?F_J@BejbIT@gtI&@Da*0leE$F&wqW_r(3hw;5AD-oo{b4`Sbau(y`I49ex zYwCA+>d8kxWXr~4)ZJH7j%=?pPwy+*Tda>AOzpU2NhpgvHzrO;kaIe;1jC^F;3^L? 
zaUKLefI418Pfq-snR^EkYrFj`%^$yPC1X#vUp}f+jm!kxU-p_}#Ui#ki$KA3gh`3# zqH=MiWSdxt-5Ilc1A6R{o|($VqYW0Vd>|q2oA|f8D#Uuk=Mf;B$66uIr{RjWny0<1 zXr7H-+?>|f_L6#ZYV@FZ+w}nLbKb`^ElbM5^0NmXgk}+i48fw`hV2Q^7}lo6QMdNn ziaA5KteEKW7rg91fMQffB*mh literal 0 HcmV?d00001 diff --git a/autotest/ogr/data/openfilegdb/polygon_golden.gdb/a00000007.gdbtablx b/autotest/ogr/data/openfilegdb/polygon_golden.gdb/a00000007.gdbtablx new file mode 100644 index 0000000000000000000000000000000000000000..8a2df70bfbfb25c52f017037fbeb1b12dd6cef4d GIT binary patch literal 4128 zcmeI$I}Lz95Jk}k%dezI0vgb<9StZ!3r82#@ZuEo0f)4R%Q~HMj{InCz~Dg%DFHb V{rA@PeIEn{cK=Wbo?or+as&A53=;qV literal 0 HcmV?d00001 diff --git a/autotest/ogr/data/openfilegdb/polygon_golden.gdb/a00000009.gdbtable b/autotest/ogr/data/openfilegdb/polygon_golden.gdb/a00000009.gdbtable new file mode 100644 index 0000000000000000000000000000000000000000..5cde63fc1c8e5395467db1421f39199af3bf5bd7 GIT binary patch literal 604 zcmZuvJx>Bb5PioToQXfco|+Jj!w+I2BA7$s3E?0XYKSO)6`If(yDN=_m93qH7XAPm zbBQ$#g_YV^nkdG`g2K0ZOJbCr+|2CE+cz`2rvuRVy;4PKhig3&dMOn!Xq{qyS>S|k z=zxx>hZ6FrVi7Q5#x;jt1r7@Eu!gwunpmahp$Q9F%pk)|0!5yzqXAbUi_DHfd1Nn< zwMO10o)N60a8s}`O>RrtBvP!9merE9lgy?UeWe!~67~(n+Ga~v9jws?? yHU0!IxAz~So55_<(EQ;2yf?1&#oLqaOXWXbUQPH?!@qvKGo=cICl&P8X6FYhe{cH$ literal 0 HcmV?d00001 diff --git a/autotest/ogr/data/openfilegdb/polygon_golden.gdb/a00000009.gdbtablx b/autotest/ogr/data/openfilegdb/polygon_golden.gdb/a00000009.gdbtablx new file mode 100644 index 0000000000000000000000000000000000000000..c8318b15c83414f99a0dd89e985d8e946dd01ab3 GIT binary patch literal 5152 zcmeI$!3h8$3-aqghxB=4G044wc literal 0 HcmV?d00001 diff --git a/autotest/ogr/data/openfilegdb/polygon_golden.gdb/a00000009.spx b/autotest/ogr/data/openfilegdb/polygon_golden.gdb/a00000009.spx new file mode 100644 index 0000000000000000000000000000000000000000..44769469081c5b257ffbc01a78a1837d109dc375 GIT binary patch literal 4118 zcmZQzU|?VdVh~_N!eD_>Vl)IsLtr!nMnizy5KsVW6abd1U>e9?0OAAjC>RZa(GVC7 ffzc2c4S~@R7!85Z5Eu=C0SN&P2UxEkc@zKu*C7FB literal 0 HcmV?d00001 diff --git a/autotest/ogr/data/openfilegdb/polygon_golden.gdb/gdb b/autotest/ogr/data/openfilegdb/polygon_golden.gdb/gdb new file mode 100644 index 0000000000000000000000000000000000000000..506f9c6282948917a4e1036b503d10efe5d0ae2f GIT binary patch literal 8 PcmZQ&U|_hncHesd2pR)E literal 0 HcmV?d00001 diff --git a/autotest/ogr/data/openfilegdb/polygon_golden.gdb/timestamps b/autotest/ogr/data/openfilegdb/polygon_golden.gdb/timestamps new file mode 100644 index 000000000000..05d2b9440ec0 --- /dev/null +++ b/autotest/ogr/data/openfilegdb/polygon_golden.gdb/timestamps @@ -0,0 +1 @@ +ÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿ \ No newline at end of file diff --git a/autotest/ogr/ogr_openfilegdb_write.py b/autotest/ogr/ogr_openfilegdb_write.py index ad2ccf036bb6..d2b0486267a4 100755 --- a/autotest/ogr/ogr_openfilegdb_write.py +++ b/autotest/ogr/ogr_openfilegdb_write.py @@ -13,6 +13,7 @@ # SPDX-License-Identifier: MIT ############################################################################### +import os import struct import sys @@ -4571,3 +4572,32 @@ def test_ogr_openfilegdb_write_OGRUnsetMarker(tmp_vsimem): lyr = ds.GetLayer(0) f 
= lyr.GetNextFeature() assert f["i32"] == -21121 + + +############################################################################### +# Verify that we can generate an output that is byte-identical to the expected golden file. + + +@pytest.mark.parametrize( + "src_directory", + [ + # Generated with: + # ogr2ogr autotest/ogr/data/openfilegdb/polygon_golden.gdb '{"type":"Feature","properties":{"foo":"bar"},"geometry":{"type":"Polygon","coordinates":[[[0,0],[0,1],[1,0],[0,0]]]}}' --config OPENFILEGDB_CREATOR GDAL --config OPENFILEGDB_REPRODUCIBLE_UUID YES -f openfilegdb + "data/openfilegdb/polygon_golden.gdb", + ], +) +def test_ogr_openfilegdb_write_check_golden_file(tmp_path, src_directory): + + out_directory = str(tmp_path / "test.gdb") + with gdaltest.config_options( + {"OPENFILEGDB_CREATOR": "GDAL", "OPENFILEGDB_REPRODUCIBLE_UUID": "YES"} + ): + gdal.VectorTranslate(out_directory, src_directory, format="OpenFileGDB") + for filename in os.listdir(src_directory): + src_filename = os.path.join(src_directory, filename) + out_filename = os.path.join(out_directory, filename) + + assert os.stat(src_filename).st_size == os.stat(out_filename).st_size, filename + assert ( + open(src_filename, "rb").read() == open(out_filename, "rb").read() + ), filename diff --git a/ogr/ogrsf_frmts/openfilegdb/ogr_openfilegdb.h b/ogr/ogrsf_frmts/openfilegdb/ogr_openfilegdb.h index ae2a5f3e3d0f..36c29b23f3a9 100644 --- a/ogr/ogrsf_frmts/openfilegdb/ogr_openfilegdb.h +++ b/ogr/ogrsf_frmts/openfilegdb/ogr_openfilegdb.h @@ -28,7 +28,7 @@ using namespace OpenFileGDB; -std::string OFGDBGenerateUUID(); +std::string OFGDBGenerateUUID(bool bInit = false); int OGROpenFileGDBIsComparisonOp(int op); diff --git a/ogr/ogrsf_frmts/openfilegdb/ogropenfilegdb_generate_uuid.cpp b/ogr/ogrsf_frmts/openfilegdb/ogropenfilegdb_generate_uuid.cpp index 4198263416ac..a8aa87b1341e 100644 --- a/ogr/ogrsf_frmts/openfilegdb/ogropenfilegdb_generate_uuid.cpp +++ b/ogr/ogrsf_frmts/openfilegdb/ogropenfilegdb_generate_uuid.cpp @@ -54,7 +54,7 @@ static int CPLGettimeofday(struct CPLTimeVal *tp, void * /* timezonep*/) // Probably not the best UUID generator ever. One issue is that mt19937 // uses only a 32-bit seed. CPL_NOSANITIZE_UNSIGNED_INT_OVERFLOW -std::string OFGDBGenerateUUID() +std::string OFGDBGenerateUUID(bool bInit) { struct CPLTimeVal tv; memset(&tv, 0, sizeof(tv)); @@ -62,57 +62,82 @@ std::string OFGDBGenerateUUID() const bool bReproducibleUUID = CPLTestBool(CPLGetConfigOption("OPENFILEGDB_REPRODUCIBLE_UUID", "NO")); + if (bInit) + { + if (bReproducibleUUID) + nCounter = 0; + return std::string(); + } + + uint32_t nCounterLocal = nCounter; + // From POSIX.1-2001 as an example of an implementation of rand() + // for reproducible output. + // We have to use that rather than relying on std::mt19937 + + // std::uniform_int_distribution since they don't given the same output + // from the same seed on all platforms. + const auto reproducibleRand = [&nCounterLocal]() + { + nCounterLocal = nCounterLocal * 1103515245U + 12345U; + return (nCounterLocal / 65536U) % 32768U; + }; + std::stringstream ss; { if (!bReproducibleUUID) + { CPLGettimeofday(&tv, nullptr); - std::mt19937 gen(++nCounter + - (bReproducibleUUID - ? 0 - : static_cast(tv.tv_sec ^ tv.tv_usec))); + ++nCounter; + } + std::mt19937 gen(nCounter + + static_cast(tv.tv_sec ^ tv.tv_usec)); std::uniform_int_distribution<> dis(0, 15); ss << "{"; ss << std::hex; for (int i = 0; i < 8; i++) { - ss << dis(gen); + ss << (bReproducibleUUID ? 
(reproducibleRand() % 16) : dis(gen)); } ss << "-"; for (int i = 0; i < 4; i++) { - ss << dis(gen); + ss << (bReproducibleUUID ? (reproducibleRand() % 16) : dis(gen)); } ss << "-4"; for (int i = 0; i < 3; i++) { - ss << dis(gen); + ss << (bReproducibleUUID ? (reproducibleRand() % 16) : dis(gen)); } } { if (!bReproducibleUUID) + { CPLGettimeofday(&tv, nullptr); - std::mt19937 gen(++nCounter + - (bReproducibleUUID - ? 0 - : static_cast(tv.tv_sec ^ tv.tv_usec))); + ++nCounter; + } + std::mt19937 gen(nCounter + + static_cast(tv.tv_sec ^ tv.tv_usec)); std::uniform_int_distribution<> dis(0, 15); std::uniform_int_distribution<> dis2(8, 11); ss << "-"; - ss << dis2(gen); + ss << (bReproducibleUUID ? 8 : dis2(gen)); for (int i = 0; i < 3; i++) { - ss << dis(gen); + ss << (bReproducibleUUID ? (reproducibleRand() % 16) : dis(gen)); } ss << "-"; for (int i = 0; i < 12; i++) { - ss << dis(gen); + ss << (bReproducibleUUID ? (reproducibleRand() % 16) : dis(gen)); }; ss << "}"; - return ss.str(); } + + if (bReproducibleUUID) + nCounter = nCounterLocal; + + return ss.str(); } diff --git a/ogr/ogrsf_frmts/openfilegdb/ogropenfilegdbdatasource_write.cpp b/ogr/ogrsf_frmts/openfilegdb/ogropenfilegdbdatasource_write.cpp index f6e54d500998..27ba3a2dd9f4 100644 --- a/ogr/ogrsf_frmts/openfilegdb/ogropenfilegdbdatasource_write.cpp +++ b/ogr/ogrsf_frmts/openfilegdb/ogropenfilegdbdatasource_write.cpp @@ -1299,6 +1299,8 @@ bool OGROpenFileGDBDataSource::Create(const char *pszName) return false; } + CPL_IGNORE_RET_VAL(OFGDBGenerateUUID(/* bInit = */ true)); + m_osDirName = pszName; eAccess = GA_Update; From 9a71eff3be6f437f5dd25a671b2494b5395ad0ad Mon Sep 17 00:00:00 2001 From: Even Rouault Date: Thu, 31 Oct 2024 11:24:49 +0100 Subject: [PATCH 06/62] test: shape: verify that we can generate an output that is byte-identical to the expected golden file. 
--- autotest/ogr/data/shp/poly_golden/poly.dbf | Bin 0 -> 530 bytes autotest/ogr/data/shp/poly_golden/poly.prj | 1 + autotest/ogr/data/shp/poly_golden/poly.shp | Bin 0 -> 4580 bytes autotest/ogr/data/shp/poly_golden/poly.shx | Bin 0 -> 180 bytes autotest/ogr/ogr_shape.py | 31 +++++++++++++++++++++ 5 files changed, 32 insertions(+) create mode 100644 autotest/ogr/data/shp/poly_golden/poly.dbf create mode 100644 autotest/ogr/data/shp/poly_golden/poly.prj create mode 100644 autotest/ogr/data/shp/poly_golden/poly.shp create mode 100644 autotest/ogr/data/shp/poly_golden/poly.shx diff --git a/autotest/ogr/data/shp/poly_golden/poly.dbf b/autotest/ogr/data/shp/poly_golden/poly.dbf new file mode 100644 index 0000000000000000000000000000000000000000..ad76f9f42a5bb8bc184bbc7ef30282cbb181d274 GIT binary patch literal 530 zcmZ9Iy-vh15QKfYjvtAdDkTk;XV<%Hmk}2v8ibC5l7<)I&9M{P z+&Jfc-BbBqtuN>1^D;N}z3z9%OJhfx<{dEPNGRRkCeIyed%>kCr;+n3z4=KPTI}u0LY?@ZMygD^)&UCp}Hs>vkg0iR$eG_;iD+rbp zm7)euqQN9|gqD=yCG~|us06E^u?rfmUgY{ig?nMY=YGF;KRdI%**>0g|DAL0_nt?k zYOYaN{HgeTV6;+Yu<3nwSIsN@sm;{p{I@yybmZ)pX_2ou!)5|j*qwk~e+Je&m%{&hu$nGTU>c9$&Zvm&hx`p5{3F9@-qUSZie<|&aVA~EhYiaKUXBW7?F8VJce?;zS zjPuz;V7K)l>@S`76yoFFhh4^N)Mat^JLbf(eg=F~Y+@{A+-00Z9A%tSPE#Jd$dlZc z-Le?xTFF$`>7;AcKad%h)F{gRltNy>< zoyf1ue-QN%ME%J6lJyrved-d_pKoJahEQ*+mFI3FHB@%Q}#CqC_2ys54QATGXkm zV_D~=h13oR4Rx9{wt-UHSYgZV>bP-?+Z!`a|IKzA0*> z+Jp0|=P;klN1oEf%xlWca4x7lQCsr7URBI%@=ZD;bw~@HlJzU|F6YaaO7E<^E9tF@ zZ;{aZ5jr5>G4alYSMe?>#@|;$&s{mHGk9&2o9%Mg}ty@|2 zEx3h#ay{avI$NUMPXU`hABF!o+Hb^KSt;1o!Czkw4liRqLv~}qKItp>YR_Ts{xs&d zyr*_6*dK@8*3?*{)miD*6 zitn;|@`-5ob+CK28sphd*`wfWIqas)M!Tjy+FG!?1@`RxQ?PSBcN}a|fA+yztha>z zW7b8^n!su&IG(+5Y*&Hri!cXvyAf<_!0G$LR)fP9`cJ04X-%BpU1VLHZ;x#MVqF{t zhd;pT4Y0oe_SInZGyTsn9^ZjoFIb-j+avVno%en8Uj@7A28U*_pWBB0%psqA9Q!S! z{{ZrtDcW}<-+Bn&U;aMayKs@;V{35l)M)xY!+Y{8>}mtgl(U#0#^eKu0RrM`>2j>x;Hi=>$;R_YSI6ZI_AiE0G9 z7V>nkUk&y<@ExgM0=r)H;O-qVd-Bvbr`)7(H`%9H(Z|UA3J%&qAlhMTDwx0El`fc%-2{ZF5s Date: Fri, 1 Nov 2024 18:53:01 +0100 Subject: [PATCH 07/62] test: JP2OpenJPEG: verify that we can generate an output that is byte-identical to the expected golden file. 
--- .../jpeg2000/byte_lossless_openjp2_golden.jp2 | Bin 0 -> 2798 bytes autotest/gdrivers/jp2openjpeg.py | 26 ++++++++++++++++++ 2 files changed, 26 insertions(+) create mode 100644 autotest/gdrivers/data/jpeg2000/byte_lossless_openjp2_golden.jp2 diff --git a/autotest/gdrivers/data/jpeg2000/byte_lossless_openjp2_golden.jp2 b/autotest/gdrivers/data/jpeg2000/byte_lossless_openjp2_golden.jp2 new file mode 100644 index 0000000000000000000000000000000000000000..049871c52ec827909d5438923cbed23f9dc19f1c GIT binary patch literal 2798 zcmb7GU2GIp6u#3gEi{E%p;c;w!{Q%Uc6YYWx)Zux=pPkYmKGI>F-&*v?oOGVyWE-W z57tX-g{?uJF{ICn@sPS zbG~!Vch0$U_W}S)d&4}xxMZXR0C0EI7)=_LgQgHs z0kD<;;$*E<;xZ=dGWKxA#bgDbEP_VH^5zf^6c zrmv50WvdA*{y@@EgvjM;kIKgqsz;U#iM$t&TZXEP0V8CrRW#j@vKU?$`UXgbw$*fek+PdG_fN>PA=Y}VS4DfRJ^qw!6SH+7)2Eo z>XhB1VSm_s^$H%J$LG{}26WjSBn7Y@^q`a`LwVC6)mXX~tq-b%1+QbhQ%b-FcM5AF zibX^jM#Tu$#g=e)n<)D0>IH#?%DSqkAL1s2HHd@j*7@rFKEAeoy-)D_Hc%F#-OdtP zDO1U$%})rJenM@=&u%w~vTzxutQ@9s)Wa%_np5t8i5@~~uF84SH*FT7JYLx%yXbC- zDo}32irj?yAeLf~mz4z6bT&-Co9;<$+<2&bi4Ds%yeJd z0k;|5$FHrkinV(?!<%I}w!9r+n*ZawVdY~9bPH>6*#6xXaQ#9~(29+D0Gxc5lOi(9Sc-YfC4O-ooQMkB2`R9k1!z{ozF5 z@WrC5=elMB&F?*YqJtaJ-ua^Ozyqy^r;mL&vgN3|uTIo;>^N$rXnC-q(lU{`t!4l~;SN@840D9x6IQYv!7Ld&bhw zsvRqS+j#lubivg2^4*JecivG`y6wjBMC19*u;Kn&A)|A8Y1bh*F;utUlSgK69RI`h Q`Djh~gD3S}Z>E3z8=(mJ#Q*>R literal 0 HcmV?d00001 diff --git a/autotest/gdrivers/jp2openjpeg.py b/autotest/gdrivers/jp2openjpeg.py index 387a15cc0756..1aa3439bba66 100755 --- a/autotest/gdrivers/jp2openjpeg.py +++ b/autotest/gdrivers/jp2openjpeg.py @@ -3920,3 +3920,29 @@ def test_jp2openjpeg_unsupported_srs_for_gmljp2(tmp_vsimem): assert ds.GetSpatialRef().IsSame(ref_srs) # Check that we do *not* have a GMLJP2 box assert "xml:gml.root-instance" not in ds.GetMetadataDomainList() + + +############################################################################### +# Verify that we can generate an output that is byte-identical to the expected golden file. +# (might be risky depending on libopenjp2...) + + +@pytest.mark.parametrize( + "src_filename,creation_options", + [ + # Created with gdal_translate autotest/gcore/data/byte.tif autotest/gdrivers/data/jpeg2000/byte_lossless_openjp2_golden.jp2 -of jp2openjpeg -co QUALITY=100 -co REVERSIBLE=YES -co COMMENT= + ( + "data/jpeg2000/byte_lossless_openjp2_golden.jp2", + ["QUALITY=100", "REVERSIBLE=YES", "COMMENT="], + ), + ], +) +def test_jp2openjpeg_write_check_golden_file(tmp_path, src_filename, creation_options): + + out_filename = str(tmp_path / "test.jp2") + with gdal.Open(src_filename) as src_ds: + gdal.GetDriverByName("JP2OpenJPEG").CreateCopy( + out_filename, src_ds, options=creation_options + ) + assert os.stat(src_filename).st_size == os.stat(out_filename).st_size + assert open(src_filename, "rb").read() == open(out_filename, "rb").read() From 0d7c23dd27cd4f76ff17ed88703d36a1783cecb0 Mon Sep 17 00:00:00 2001 From: Even Rouault Date: Fri, 1 Nov 2024 19:15:15 +0100 Subject: [PATCH 08/62] test: netCDF: verify that we can generate an output that is byte-identical to the expected golden file. 
---
 .../gdrivers/data/netcdf/byte_nc3_golden.nc   | Bin 0 -> 3200 bytes
 autotest/gdrivers/netcdf.py                   |  33 ++++++++++++++++++
 2 files changed, 33 insertions(+)
 create mode 100644 autotest/gdrivers/data/netcdf/byte_nc3_golden.nc

diff --git a/autotest/gdrivers/data/netcdf/byte_nc3_golden.nc b/autotest/gdrivers/data/netcdf/byte_nc3_golden.nc
new file mode 100644
index 0000000000000000000000000000000000000000..09b43e1a71fbef532c02c2bdf0d00e59cf15741f
GIT binary patch
literal 3200
zcmeHJO>7%Q6n08mNYfTkkbt0~)pCG?6TG%#drx$6+{9I5M~*F(MAd2;uifq1yV1_3
zP1y^Gk?3I$Anl<=TmB?VJz&8lq8y@fLE=KsT)0A^K+A6{fmGhx*(6TnN+2Xo9c%K<
z?0hrtn>TOrrZ|0cB*Du*fL%~WR(bz1Y@gd|?}sLiADWp~j;l(oq^Q-JI$teU>JbUY
zb}Npq0ge@I>x^!>hHX(C#Ie1_qZ88P6m}D1PsXDVcDki6u
zdSec_=xtTO(<7RtPHXN;SC!;+UQ8~`&z5S{^7N8e>}ZY&%M@}sF`3Kc3+YsDGMh}x
z*-1HH$jODYlqtxCq|&I*R%_+@OG{#DeqlyTiuqKkAf7xqWF(i)<#A-bR-P+?kQqnU
zt+NI>B_>lt=mj|?A-z#4*O$bm-Ud1_CsR^BJC#Yx=_yH;vzg40sj`$#57&_0)KEQE
zD%Q)@%92>$x-@g~r4d1;Qd8zi^-^s~9E?C7cA#S$W9O9+lM|AZ-$oe>N!*4k%eifY
zZTCe?PNlF6VQk!%NCS6HS9cxojWo;^Degt(L>c0w#BZaREGY|hIHTxe6c7+{VzN@L
z)n|tt9e7Q`!+^LOqswq1;rcsz>wX;Gk3$F>h|>Q*4)@RKc64)Fw$tVR4-@0rY$~5l
z3AvPz5;FNIDFdC9FadoG)hKU5o*exU+8cR!BVS{Fd;sbJ>S|U~bDBGH!Q%tDXH_M(
z?KBNbbEE7zEO(5r3R@(>ZV6lS!Ac^rX9bc#*YCv-V3M^>%Y->
z?w@Ep_n$QGn+HvMUIceZ(13#Pl0kpsn`D@8qBoC#6NSo3_hsD?_yVJ9!=gsp(wn$i
zd|s}zf=(Vgqjlin0{^Sqk1N@%MYZWkBD^n~B7}Tfj>wogfBWt=X8K;hdcDABo(aEj&F_1RQ12Ke
zo*%NF@00bu@AoKMBy^DwlQ3d3)(b)pSMdpjZ##
I4;lu41IpKU%K!iX

literal 0
HcmV?d00001

diff --git a/autotest/gdrivers/netcdf.py b/autotest/gdrivers/netcdf.py
index 21c0b3026ca6..6622974f49b4 100755
--- a/autotest/gdrivers/netcdf.py
+++ b/autotest/gdrivers/netcdf.py
@@ -6583,3 +6583,36 @@ def test_netcdf_extra_dim_no_georef(tmp_path):
     ds = gdal.Open(fname)
     assert ds.RasterCount == 4
     assert ds.ReadRaster() == src_ds.ReadRaster()
+
+
+###############################################################################
+# Verify that we can generate an output that is byte-identical to the expected golden file.
+# (might be risky depending on libnetcdf...)
+
+
+@pytest.mark.parametrize(
+    "src_filename,golden_file,creation_options",
+    [
+        # Created with gdal_translate autotest/gcore/data/byte.tif autotest/gdrivers/data/netcdf/byte_nc3_golden.nc -co WRITE_GDAL_VERSION=NO -co WRITE_GDAL_HISTORY=NO -co FORMAT=NC
+        (
+            "../gcore/data/byte.tif",
+            "data/netcdf/byte_nc3_golden.nc",
+            ["WRITE_GDAL_VERSION=NO", "WRITE_GDAL_HISTORY=NO", "FORMAT=NC"],
+        ),
+    ],
+)
+# I have a feeling that netCDF might be host endianness dependent...
+@pytest.mark.skipif( + sys.byteorder != "little", reason="only supported on little-endian hosts" +) +def test_netcdf_write_check_golden_file( + tmp_path, src_filename, golden_file, creation_options +): + + out_filename = str(tmp_path / "test.nc") + with gdal.Open(src_filename) as src_ds: + gdal.GetDriverByName("netCDF").CreateCopy( + out_filename, src_ds, options=creation_options + ) + assert os.stat(golden_file).st_size == os.stat(out_filename).st_size + assert open(golden_file, "rb").read() == open(out_filename, "rb").read() From 3e8bf03a35ba65ae3e03ad3495dc1fc709b4637e Mon Sep 17 00:00:00 2001 From: Even Rouault Date: Sat, 2 Nov 2024 03:03:17 +0100 Subject: [PATCH 09/62] scripts/cppcheck.sh: remove obsolete lines about SDE --- scripts/cppcheck.sh | 4 ---- 1 file changed, 4 deletions(-) diff --git a/scripts/cppcheck.sh b/scripts/cppcheck.sh index ef7eb728a714..c1d1ce89f561 100755 --- a/scripts/cppcheck.sh +++ b/scripts/cppcheck.sh @@ -146,10 +146,6 @@ ret_code=0 grep -v "unmatchedSuppression" ${LOG_FILE} | grep -v -e " yacc.c" -e PublicDecompWT -e "kdu_cache_wrapper.h" > ${LOG_FILE}.tmp mv ${LOG_FILE}.tmp ${LOG_FILE} -# I don't want to care about SDE -grep -v -e "frmts/sde" -e "ogr/ogrsf_frmts/sde" ${LOG_FILE} > ${LOG_FILE}.tmp -mv ${LOG_FILE}.tmp ${LOG_FILE} - # I don't want to care about flatbuffers grep -v -e "ogr/ogrsf_frmts/flatgeobuf/flatbuffers" ${LOG_FILE} > ${LOG_FILE}.tmp mv ${LOG_FILE}.tmp ${LOG_FILE} From 062bbee3b35d22b5c4fe937324de55f5bb9ef283 Mon Sep 17 00:00:00 2001 From: Even Rouault Date: Sat, 2 Nov 2024 02:22:04 +0100 Subject: [PATCH 10/62] OGRGeometry classes: implement move constructor and move assignment operator as suggested by performance warnings of Coverity Scan --- autotest/cpp/test_ogr.cpp | 54 ++++++++++++++++++++ ogr/ogr_geometry.h | 75 ++++++++++++++++++++++++++- ogr/ogrcurvecollection.cpp | 38 ++++++++++++++ ogr/ogrgeometry.cpp | 37 ++++++++++++++ ogr/ogrgeometrycollection.cpp | 45 +++++++++++++++++ ogr/ogrlinestring.cpp | 95 ++++++++++++++++++++++++++++++++++- scripts/cppcheck.sh | 6 +++ 7 files changed, 347 insertions(+), 3 deletions(-) diff --git a/autotest/cpp/test_ogr.cpp b/autotest/cpp/test_ogr.cpp index 9f2de42a3150..b4f96c1d7233 100644 --- a/autotest/cpp/test_ogr.cpp +++ b/autotest/cpp/test_ogr.cpp @@ -368,6 +368,60 @@ TEST_F(test_ogr, OGRCurvePolygon_copy_constructor_illegal_use) EXPECT_TRUE(poly.IsEmpty()); } +template void testMove() +{ + auto poSRS = new OGRSpatialReference(); + { + auto poOrigin = std::unique_ptr(make()); + ASSERT_TRUE(nullptr != poOrigin); + poOrigin->assignSpatialReference(poSRS); + + T valueCopy(*poOrigin); + const int refCountBefore = poSRS->GetReferenceCount(); + T fromMoved(std::move(*poOrigin)); + EXPECT_EQ(poSRS->GetReferenceCount(), refCountBefore); + + ASSERT_TRUE(CPL_TO_BOOL(fromMoved.Equals(&valueCopy))) + << valueCopy.getGeometryName() + << ": move constructor changed a value"; + EXPECT_EQ(fromMoved.getSpatialReference(), poSRS); + + T valueCopy2(valueCopy); + EXPECT_EQ(valueCopy.getSpatialReference(), poSRS); + T value3; + const int refCountBefore2 = poSRS->GetReferenceCount(); + value3 = std::move(valueCopy); + EXPECT_EQ(poSRS->GetReferenceCount(), refCountBefore2); + + ASSERT_TRUE(CPL_TO_BOOL(value3.Equals(&valueCopy2))) + << valueCopy2.getGeometryName() + << ": move assignment operator changed a value"; + EXPECT_EQ(value3.getSpatialReference(), poSRS); + } + EXPECT_EQ(poSRS->GetReferenceCount(), 1); + poSRS->Release(); +} + +TEST_F(test_ogr, geometry_move) +{ + testMove(); + testMove(); + testMove(); + 
testMove(); + testMove(); + testMove(); + testMove(); + testMove(); + testMove(); + testMove(); + testMove(); + testMove(); + testMove(); + testMove(); + testMove(); + testMove(); +} + TEST_F(test_ogr, geometry_get_point) { { diff --git a/ogr/ogr_geometry.h b/ogr/ogr_geometry.h index abd66ace0e8f..5bc1134e7c25 100644 --- a/ogr/ogr_geometry.h +++ b/ogr/ogr_geometry.h @@ -407,9 +407,11 @@ class CPL_DLL OGRGeometry OGRGeometry(); OGRGeometry(const OGRGeometry &other); + OGRGeometry(OGRGeometry &&other); virtual ~OGRGeometry(); OGRGeometry &operator=(const OGRGeometry &other); + OGRGeometry &operator=(OGRGeometry &&other); /** Returns if two geometries are equal. */ bool operator==(const OGRGeometry &other) const @@ -1144,9 +1146,13 @@ class CPL_DLL OGRPoint : public OGRGeometry OGRPoint(double x, double y, double z); OGRPoint(double x, double y, double z, double m); OGRPoint(const OGRPoint &other); + /** Move constructor */ + OGRPoint(OGRPoint &&other) = default; static OGRPoint *createXYM(double x, double y, double m); OGRPoint &operator=(const OGRPoint &other); + /** Move assignment operator */ + OGRPoint &operator=(OGRPoint &&other) = default; // IWks Interface size_t WkbSize() const override; @@ -1317,6 +1323,7 @@ class CPL_DLL OGRCurve : public OGRGeometry //! @cond Doxygen_Suppress OGRCurve() = default; OGRCurve(const OGRCurve &other) = default; + OGRCurve(OGRCurve &&other) = default; virtual OGRCurveCasterToLineString GetCasterToLineString() const = 0; virtual OGRCurveCasterToLinearRing GetCasterToLinearRing() const = 0; @@ -1350,6 +1357,7 @@ class CPL_DLL OGRCurve : public OGRGeometry public: //! @cond Doxygen_Suppress OGRCurve &operator=(const OGRCurve &other); + OGRCurve &operator=(OGRCurve &&other) = default; //! @endcond /** Type of child elements. */ @@ -1532,6 +1540,8 @@ class CPL_DLL OGRSimpleCurve : public OGRCurve OGRSimpleCurve(const OGRSimpleCurve &other); + OGRSimpleCurve(OGRSimpleCurve &&other); + private: class CPL_DLL Iterator { @@ -1576,6 +1586,8 @@ class CPL_DLL OGRSimpleCurve : public OGRCurve OGRSimpleCurve &operator=(const OGRSimpleCurve &other); + OGRSimpleCurve &operator=(OGRSimpleCurve &&other); + /** Type of child elements. */ typedef OGRPoint ChildType; @@ -1776,8 +1788,10 @@ class CPL_DLL OGRLineString : public OGRSimpleCurve /** Create an empty line string. */ OGRLineString() = default; OGRLineString(const OGRLineString &other); + OGRLineString(OGRLineString &&other); OGRLineString &operator=(const OGRLineString &other); + OGRLineString &operator=(OGRLineString &&other); virtual OGRLineString *clone() const override; virtual OGRLineString * @@ -1882,9 +1896,13 @@ class CPL_DLL OGRLinearRing : public OGRLineString /** Constructor */ OGRLinearRing() = default; OGRLinearRing(const OGRLinearRing &other); + /** Move constructor*/ + OGRLinearRing(OGRLinearRing &&other) = default; explicit OGRLinearRing(const OGRLinearRing *); OGRLinearRing &operator=(const OGRLinearRing &other); + /** Move assignment operator */ + OGRLinearRing &operator=(OGRLinearRing &&other) = default; // Non standard. virtual const char *getGeometryName() const override; @@ -1965,8 +1983,12 @@ class CPL_DLL OGRCircularString : public OGRSimpleCurve OGRCircularString() = default; OGRCircularString(const OGRCircularString &other); + /** Move constructor */ + OGRCircularString(OGRCircularString &&other) = default; OGRCircularString &operator=(const OGRCircularString &other); + /** Move assignment operator */ + OGRCircularString &operator=(OGRCircularString &&other) = default; // IWks Interface. 
virtual OGRErr importFromWkb(const unsigned char *, size_t, OGRwkbVariant, @@ -2074,9 +2096,11 @@ class CPL_DLL OGRCurveCollection public: OGRCurveCollection() = default; OGRCurveCollection(const OGRCurveCollection &other); + OGRCurveCollection(OGRCurveCollection &&other); ~OGRCurveCollection(); OGRCurveCollection &operator=(const OGRCurveCollection &other); + OGRCurveCollection &operator=(OGRCurveCollection &&other); /** Type of child elements. */ typedef OGRCurve ChildType; @@ -2207,8 +2231,12 @@ class CPL_DLL OGRCompoundCurve : public OGRCurve OGRCompoundCurve() = default; OGRCompoundCurve(const OGRCompoundCurve &other); + /** Move constructor */ + OGRCompoundCurve(OGRCompoundCurve &&other) = default; OGRCompoundCurve &operator=(const OGRCompoundCurve &other); + /** Move assignment operator */ + OGRCompoundCurve &operator=(OGRCompoundCurve &&other) = default; /** Type of child elements. */ typedef OGRCurve ChildType; @@ -2476,8 +2504,12 @@ class CPL_DLL OGRCurvePolygon : public OGRSurface OGRCurvePolygon() = default; OGRCurvePolygon(const OGRCurvePolygon &); + /** Move constructor */ + OGRCurvePolygon(OGRCurvePolygon &&) = default; OGRCurvePolygon &operator=(const OGRCurvePolygon &other); + /** Move assignment operator */ + OGRCurvePolygon &operator=(OGRCurvePolygon &&other) = default; /** Type of child elements. */ typedef OGRCurve ChildType; @@ -2685,8 +2717,12 @@ class CPL_DLL OGRPolygon : public OGRCurvePolygon OGRPolygon() = default; OGRPolygon(const OGRPolygon &other); + /** Move constructor */ + OGRPolygon(OGRPolygon &&other) = default; OGRPolygon &operator=(const OGRPolygon &other); + /** Move assignment operator */ + OGRPolygon &operator=(OGRPolygon &&other) = default; /** Type of child elements. */ typedef OGRLinearRing ChildType; @@ -2856,8 +2892,12 @@ class CPL_DLL OGRTriangle : public OGRPolygon OGRTriangle() = default; OGRTriangle(const OGRPoint &p, const OGRPoint &q, const OGRPoint &r); OGRTriangle(const OGRTriangle &other); + /** Move constructor */ + OGRTriangle(OGRTriangle &&other) = default; OGRTriangle(const OGRPolygon &other, OGRErr &eErr); OGRTriangle &operator=(const OGRTriangle &other); + /** Move assignment operator */ + OGRTriangle &operator=(OGRTriangle &&other) = default; virtual const char *getGeometryName() const override; virtual OGRwkbGeometryType getGeometryType() const override; @@ -2938,9 +2978,11 @@ class CPL_DLL OGRGeometryCollection : public OGRGeometry OGRGeometryCollection() = default; OGRGeometryCollection(const OGRGeometryCollection &other); + OGRGeometryCollection(OGRGeometryCollection &&other); ~OGRGeometryCollection() override; OGRGeometryCollection &operator=(const OGRGeometryCollection &other); + OGRGeometryCollection &operator=(OGRGeometryCollection &&other); /** Type of child elements. */ typedef OGRGeometry ChildType; @@ -3122,8 +3164,12 @@ class CPL_DLL OGRMultiSurface : public OGRGeometryCollection OGRMultiSurface() = default; OGRMultiSurface(const OGRMultiSurface &other); + /** Move constructor */ + OGRMultiSurface(OGRMultiSurface &&other) = default; OGRMultiSurface &operator=(const OGRMultiSurface &other); + /** Move assignment operator */ + OGRMultiSurface &operator=(OGRMultiSurface &&other) = default; /** Type of child elements. 
*/ typedef OGRSurface ChildType; @@ -3290,8 +3336,12 @@ class CPL_DLL OGRMultiPolygon : public OGRMultiSurface OGRMultiPolygon() = default; OGRMultiPolygon(const OGRMultiPolygon &other); + /** Move constructor */ + OGRMultiPolygon(OGRMultiPolygon &&other) = default; OGRMultiPolygon &operator=(const OGRMultiPolygon &other); + /** Move assignment operator */ + OGRMultiPolygon &operator=(OGRMultiPolygon &&other) = default; /** Type of child elements. */ typedef OGRPolygon ChildType; @@ -3452,9 +3502,13 @@ class CPL_DLL OGRPolyhedralSurface : public OGRSurface /** Create an empty PolyhedralSurface */ OGRPolyhedralSurface() = default; - OGRPolyhedralSurface(const OGRPolyhedralSurface &poGeom); + OGRPolyhedralSurface(const OGRPolyhedralSurface &other); + /** Move constructor */ + OGRPolyhedralSurface(OGRPolyhedralSurface &&other) = default; OGRPolyhedralSurface &operator=(const OGRPolyhedralSurface &other); + /** Move assignment operator */ + OGRPolyhedralSurface &operator=(OGRPolyhedralSurface &&other) = default; /** Type of child elements. */ typedef OGRPolygon ChildType; @@ -3629,6 +3683,12 @@ class CPL_DLL OGRTriangulatedSurface : public OGRPolyhedralSurface OGRTriangulatedSurface() = default; OGRTriangulatedSurface(const OGRTriangulatedSurface &other); + /** Move constructor */ + OGRTriangulatedSurface(OGRTriangulatedSurface &&other) = default; + + OGRTriangulatedSurface &operator=(const OGRTriangulatedSurface &other); + /** Move assignment operator */ + OGRTriangulatedSurface &operator=(OGRTriangulatedSurface &&other) = default; /** Type of child elements. */ typedef OGRTriangle ChildType; @@ -3661,7 +3721,6 @@ class CPL_DLL OGRTriangulatedSurface : public OGRPolyhedralSurface return reinterpret_cast(oMP.end()); } - OGRTriangulatedSurface &operator=(const OGRTriangulatedSurface &other); virtual const char *getGeometryName() const override; virtual OGRwkbGeometryType getGeometryType() const override; virtual OGRTriangulatedSurface *clone() const override; @@ -3764,8 +3823,12 @@ class CPL_DLL OGRMultiPoint : public OGRGeometryCollection OGRMultiPoint() = default; OGRMultiPoint(const OGRMultiPoint &other); + /** Move constructor */ + OGRMultiPoint(OGRMultiPoint &&other) = default; OGRMultiPoint &operator=(const OGRMultiPoint &other); + /** Move assignment operator */ + OGRMultiPoint &operator=(OGRMultiPoint &&other) = default; /** Type of child elements. */ typedef OGRPoint ChildType; @@ -3922,8 +3985,12 @@ class CPL_DLL OGRMultiCurve : public OGRGeometryCollection OGRMultiCurve() = default; OGRMultiCurve(const OGRMultiCurve &other); + /** Move constructor */ + OGRMultiCurve(OGRMultiCurve &&other) = default; OGRMultiCurve &operator=(const OGRMultiCurve &other); + /** Move assignment operator */ + OGRMultiCurve &operator=(OGRMultiCurve &&other) = default; /** Type of child elements. */ typedef OGRCurve ChildType; @@ -4075,8 +4142,12 @@ class CPL_DLL OGRMultiLineString : public OGRMultiCurve OGRMultiLineString() = default; OGRMultiLineString(const OGRMultiLineString &other); + /** Move constructor */ + OGRMultiLineString(OGRMultiLineString &&other) = default; OGRMultiLineString &operator=(const OGRMultiLineString &other); + /** Move assignment operator */ + OGRMultiLineString &operator=(OGRMultiLineString &&other) = default; /** Type of child elements. 
*/ typedef OGRLineString ChildType; diff --git a/ogr/ogrcurvecollection.cpp b/ogr/ogrcurvecollection.cpp index 83f3ae2e063a..38132dca41e7 100644 --- a/ogr/ogrcurvecollection.cpp +++ b/ogr/ogrcurvecollection.cpp @@ -59,6 +59,23 @@ OGRCurveCollection::OGRCurveCollection(const OGRCurveCollection &other) } } +/************************************************************************/ +/* OGRCurveCollection( OGRCurveCollection&& ) */ +/************************************************************************/ + +/** + * \brief Move constructor. + * + * @since GDAL 3.11 + */ + +OGRCurveCollection::OGRCurveCollection(OGRCurveCollection &&other) + : nCurveCount(other.nCurveCount), papoCurves(other.papoCurves) +{ + other.nCurveCount = 0; + other.papoCurves = nullptr; +} + /************************************************************************/ /* ~OGRCurveCollection() */ /************************************************************************/ @@ -107,6 +124,27 @@ OGRCurveCollection::operator=(const OGRCurveCollection &other) return *this; } +/************************************************************************/ +/* operator=( OGRCurveCollection&& ) */ +/************************************************************************/ + +/** + * \brief Move assignment operator. + * + * @since GDAL 3.11 + */ + +OGRCurveCollection &OGRCurveCollection::operator=(OGRCurveCollection &&other) +{ + if (this != &other) + { + empty(nullptr); + std::swap(nCurveCount, other.nCurveCount); + std::swap(papoCurves, other.papoCurves); + } + return *this; +} + /************************************************************************/ /* WkbSize() */ /************************************************************************/ diff --git a/ogr/ogrgeometry.cpp b/ogr/ogrgeometry.cpp index e18e1b467526..e7a1cb4c5025 100644 --- a/ogr/ogrgeometry.cpp +++ b/ogr/ogrgeometry.cpp @@ -109,6 +109,22 @@ OGRGeometry::OGRGeometry(const OGRGeometry &other) const_cast(poSRS)->Reference(); } +/************************************************************************/ +/* OGRGeometry( OGRGeometry&& ) */ +/************************************************************************/ + +/** + * \brief Move constructor. + * + * @since GDAL 3.11 + */ + +OGRGeometry::OGRGeometry(OGRGeometry &&other) + : poSRS(other.poSRS), flags(other.flags) +{ + other.poSRS = nullptr; +} + /************************************************************************/ /* ~OGRGeometry() */ /************************************************************************/ @@ -144,6 +160,27 @@ OGRGeometry &OGRGeometry::operator=(const OGRGeometry &other) return *this; } +/************************************************************************/ +/* operator=( OGRGeometry&&) */ +/************************************************************************/ + +/** + * \brief Move assignment operator. 
+ * + * @since GDAL 3.11 + */ + +OGRGeometry &OGRGeometry::operator=(OGRGeometry &&other) +{ + if (this != &other) + { + poSRS = other.poSRS; + other.poSRS = nullptr; + flags = other.flags; + } + return *this; +} + /************************************************************************/ /* dumpReadable() */ /************************************************************************/ diff --git a/ogr/ogrgeometrycollection.cpp b/ogr/ogrgeometrycollection.cpp index 915dc7f0ca86..f1beb953ad96 100644 --- a/ogr/ogrgeometrycollection.cpp +++ b/ogr/ogrgeometrycollection.cpp @@ -57,6 +57,27 @@ OGRGeometryCollection::OGRGeometryCollection(const OGRGeometryCollection &other) } } +/************************************************************************/ +/* OGRGeometryCollection( OGRGeometryCollection&& ) */ +/************************************************************************/ + +/** + * \brief Move constructor. + * + * @since GDAL 3.11 + */ + +// cppcheck-suppress-begin accessMoved +OGRGeometryCollection::OGRGeometryCollection(OGRGeometryCollection &&other) + : OGRGeometry(std::move(other)), nGeomCount(other.nGeomCount), + papoGeoms(other.papoGeoms) +{ + other.nGeomCount = 0; + other.papoGeoms = nullptr; +} + +// cppcheck-suppress-end accessMoved + /************************************************************************/ /* ~OGRGeometryCollection() */ /************************************************************************/ @@ -112,6 +133,30 @@ OGRGeometryCollection::operator=(const OGRGeometryCollection &other) return *this; } +/************************************************************************/ +/* operator=( OGRGeometryCollection&&) */ +/************************************************************************/ + +/** + * \brief Move assignment operator. + * + * @since GDAL 3.11 + */ + +OGRGeometryCollection & +OGRGeometryCollection::operator=(OGRGeometryCollection &&other) +{ + if (this != &other) + { + empty(); + + OGRGeometry::operator=(std::move(other)); + std::swap(nGeomCount, other.nGeomCount); + std::swap(papoGeoms, other.papoGeoms); + } + return *this; +} + /************************************************************************/ /* empty() */ /************************************************************************/ diff --git a/ogr/ogrlinestring.cpp b/ogr/ogrlinestring.cpp index 80445fea3b6d..929b570901e0 100644 --- a/ogr/ogrlinestring.cpp +++ b/ogr/ogrlinestring.cpp @@ -60,6 +60,31 @@ OGRSimpleCurve::OGRSimpleCurve(const OGRSimpleCurve &other) setPoints(other.nPointCount, other.paoPoints, other.padfZ, other.padfM); } +/************************************************************************/ +/* OGRSimpleCurve( OGRSimpleCurve&& ) */ +/************************************************************************/ + +/** + * \brief Move constructor. 
+ * + * @since GDAL 3.11 + */ + +// cppcheck-suppress-begin accessMoved +OGRSimpleCurve::OGRSimpleCurve(OGRSimpleCurve &&other) + : OGRCurve(std::move(other)), nPointCount(other.nPointCount), + m_nPointCapacity(other.m_nPointCapacity), paoPoints(other.paoPoints), + padfZ(other.padfZ), padfM(other.padfM) +{ + other.nPointCount = 0; + other.m_nPointCapacity = 0; + other.paoPoints = nullptr; + other.padfZ = nullptr; + other.padfM = nullptr; +} + +// cppcheck-suppress-end accessMoved + /************************************************************************/ /* ~OGRSimpleCurve() */ /************************************************************************/ @@ -73,7 +98,7 @@ OGRSimpleCurve::~OGRSimpleCurve() } /************************************************************************/ -/* operator=( const OGRPoint& ) */ +/* operator=(const OGRSimpleCurve &other) */ /************************************************************************/ /** @@ -98,6 +123,43 @@ OGRSimpleCurve &OGRSimpleCurve::operator=(const OGRSimpleCurve &other) return *this; } +/************************************************************************/ +/* operator=(OGRSimpleCurve &&other) */ +/************************************************************************/ + +/** + * \brief Move assignment operator. + * + * @since GDAL 3.11 + */ + +OGRSimpleCurve &OGRSimpleCurve::operator=(OGRSimpleCurve &&other) +{ + if (this != &other) + { + // cppcheck-suppress-begin accessMoved + OGRCurve::operator=(std::move(other)); + + nPointCount = other.nPointCount; + m_nPointCapacity = other.m_nPointCapacity; + CPLFree(paoPoints); + paoPoints = other.paoPoints; + CPLFree(padfZ); + padfZ = other.padfZ; + CPLFree(padfM); + padfM = other.padfM; + flags = other.flags; + other.nPointCount = 0; + other.m_nPointCapacity = 0; + other.paoPoints = nullptr; + other.padfZ = nullptr; + other.padfM = nullptr; + // cppcheck-suppress-end accessMoved + } + + return *this; +} + /************************************************************************/ /* flattenTo2D() */ /************************************************************************/ @@ -2806,6 +2868,18 @@ OGRPointIterator *OGRSimpleCurve::getPointIterator() const OGRLineString::OGRLineString(const OGRLineString &) = default; +/************************************************************************/ +/* OGRLineString( OGRLineString&& ) */ +/************************************************************************/ + +/** + * \brief Move constructor. + * + * @since GDAL 3.11 + */ + +OGRLineString::OGRLineString(OGRLineString &&) = default; + /************************************************************************/ /* operator=( const OGRLineString& ) */ /************************************************************************/ @@ -2828,6 +2902,25 @@ OGRLineString &OGRLineString::operator=(const OGRLineString &other) return *this; } +/************************************************************************/ +/* operator=( OGRLineString&& ) */ +/************************************************************************/ + +/** + * \brief Move assignment operator. 
+ * + * @since GDAL 3.11 + */ + +OGRLineString &OGRLineString::operator=(OGRLineString &&other) +{ + if (this != &other) + { + OGRSimpleCurve::operator=(std::move(other)); + } + return *this; +} + /************************************************************************/ /* getGeometryType() */ /************************************************************************/ diff --git a/scripts/cppcheck.sh b/scripts/cppcheck.sh index c1d1ce89f561..f3b9932c2077 100755 --- a/scripts/cppcheck.sh +++ b/scripts/cppcheck.sh @@ -180,6 +180,12 @@ mv ${LOG_FILE}.tmp ${LOG_FILE} grep -v -e "The comparison '0 <= yystate' is always true" ${LOG_FILE} > ${LOG_FILE}.tmp mv ${LOG_FILE}.tmp ${LOG_FILE} +# False positives with cppcheck of ubuntu 20.04 +grep -v -e "ogrlinestring.cpp:.*warning,accessMoved" ${LOG_FILE} > ${LOG_FILE}.tmp +mv ${LOG_FILE}.tmp ${LOG_FILE} +grep -v -e "ogrgeometrycollection.cpp:.*warning,accessMoved" ${LOG_FILE} > ${LOG_FILE}.tmp +mv ${LOG_FILE}.tmp ${LOG_FILE} + if grep "null pointer" ${LOG_FILE} ; then echo "Null pointer check failed" ret_code=1 From f23b0598fb0cc298177db3330aae57406887fd55 Mon Sep 17 00:00:00 2001 From: Even Rouault Date: Sat, 2 Nov 2024 18:56:10 +0100 Subject: [PATCH 11/62] Add gdal_minmax_element.hpp public header, that can also be vendored, to find the min/max elements in a buffer Refs https://github.com/qgis/QGIS/pull/59285 NaN values are taken into account for float/double Contains a SSE2 optimized version for int8/uint8/int16/uint16/int32/uint32/float/double in the no-nodata case (also taking into account NaN) --- autotest/cpp/CMakeLists.txt | 1 + autotest/cpp/test_gdal_minmax_element.cpp | 798 +++++++++++++++++ gcore/CMakeLists.txt | 1 + gcore/gdal_minmax_element.hpp | 971 +++++++++++++++++++++ perftests/CMakeLists.txt | 4 + perftests/testperf_gdal_minmax_element.cpp | 534 +++++++++++ 6 files changed, 2309 insertions(+) create mode 100644 autotest/cpp/test_gdal_minmax_element.cpp create mode 100644 gcore/gdal_minmax_element.hpp create mode 100644 perftests/testperf_gdal_minmax_element.cpp diff --git a/autotest/cpp/CMakeLists.txt b/autotest/cpp/CMakeLists.txt index 493f1d59f9bd..d8420177a9ff 100644 --- a/autotest/cpp/CMakeLists.txt +++ b/autotest/cpp/CMakeLists.txt @@ -77,6 +77,7 @@ add_executable( test_gdal_aaigrid.cpp test_gdal_dted.cpp test_gdal_gtiff.cpp + test_gdal_minmax_element.cpp test_gdal_pixelfn.cpp test_gdal_typetraits.cpp test_ogr.cpp diff --git a/autotest/cpp/test_gdal_minmax_element.cpp b/autotest/cpp/test_gdal_minmax_element.cpp new file mode 100644 index 000000000000..0d3216ca9b50 --- /dev/null +++ b/autotest/cpp/test_gdal_minmax_element.cpp @@ -0,0 +1,798 @@ +/////////////////////////////////////////////////////////////////////////////// +// +// Project: C++ Test Suite for GDAL/OGR +// Purpose: Test gdal_minmax_element.hpp +// Author: Even Rouault +// +/////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2023, Even Rouault +/* + * SPDX-License-Identifier: MIT + ****************************************************************************/ + +#include "gdal_unit_test.h" + +#include "gdal_minmax_element.hpp" + +#include "gtest_include.h" + +#include + +namespace +{ + +struct test_gdal_minmax_element : public ::testing::Test +{ +}; + +TEST_F(test_gdal_minmax_element, uint8) +{ + using T = uint8_t; + constexpr GDALDataType eDT = GDT_Byte; + T min_v = 3; + T max_v = 7; + { + T nodata = 0; + std::vector v{max_v, nodata, min_v}; + { + auto idx_min = gdal::min_element(v.data(), 0, eDT, true, nodata); + 
EXPECT_EQ(idx_min, 0); + } + { + auto idx_min = gdal::min_element(v.data(), 0, eDT, false, 0); + EXPECT_EQ(idx_min, 0); + } + { + auto idx_min = + gdal::min_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = + gdal::max_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_max], max_v); + } + { + auto [idx_min, idx_max] = + gdal::minmax_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_min], min_v); + EXPECT_EQ(v[idx_max], max_v); + } + } + { + T nodata = 0; + std::vector v{nodata, max_v, min_v}; + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = gdal::max_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_max], max_v); + } + { + std::vector v{static_cast((min_v + max_v) / 2), + static_cast(max_v - 1), + max_v, + static_cast(max_v - 1), + static_cast(min_v + 1), + min_v, + static_cast(min_v + 1)}; + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = gdal::max_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_max], max_v); + } + { + std::vector v(257, static_cast((min_v + max_v) / 2)); + v[125] = static_cast(min_v + 1); + v[126] = min_v; + v[127] = static_cast(min_v + 1); + v[128] = static_cast(max_v - 1); + v[129] = max_v; + v[130] = static_cast(max_v - 1); + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = gdal::max_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_max], max_v); + } + { + std::vector v(259, static_cast((min_v + max_v) / 2)); + v[0] = min_v; + v[256] = static_cast(max_v - 1); + v[257] = max_v; + v[258] = static_cast(max_v - 1); + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = gdal::max_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_max], max_v); + } + { + std::vector v(257, static_cast((min_v + max_v) / 2)); + v[0] = min_v; + v[127] = max_v; + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = gdal::max_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_max], max_v); + } + { + std::vector v(257, static_cast((min_v + max_v) / 2)); + v[127] = min_v; + v[0] = max_v; + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = gdal::max_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_max], max_v); + } + { + std::vector v(257, static_cast((min_v + max_v) / 2)); + v[0] = min_v; + v[129] = max_v; + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = gdal::max_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_max], max_v); + } + { + std::vector v(257, static_cast((min_v + max_v) / 2)); + v[129] = min_v; + v[0] = max_v; + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = gdal::max_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_max], max_v); + } + { + std::vector v(257, static_cast((min_v + max_v) / 2)); + v[129] = min_v; + v[256] = max_v; + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = gdal::max_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_max], max_v); + } + { + std::vector v(257, 
static_cast((min_v + max_v) / 2)); + v[256] = min_v; + v[129] = max_v; + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = gdal::max_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_max], max_v); + } +} + +TEST_F(test_gdal_minmax_element, int8) +{ + using T = int8_t; + T min_v = -1; + T max_v = 3; + constexpr GDALDataType eDT = GDT_Int8; + { + T nodata = 0; + std::vector v{max_v, nodata, min_v}; + { + auto idx_min = gdal::min_element(v.data(), 0, eDT, true, nodata); + EXPECT_EQ(idx_min, 0); + } + { + auto idx_min = gdal::min_element(v.data(), 0, eDT, false, 0); + EXPECT_EQ(idx_min, 0); + } + { + auto idx_min = + gdal::min_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = + gdal::max_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_max], max_v); + } + { + auto [idx_min, idx_max] = + gdal::minmax_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_min], min_v); + EXPECT_EQ(v[idx_max], max_v); + } + } + { + T nodata = 0; + std::vector v{nodata, max_v, min_v}; + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = gdal::max_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_max], max_v); + } + { + std::vector v{static_cast((min_v + max_v) / 2), max_v, min_v}; + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = gdal::max_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_max], max_v); + } + { + std::vector v(257, static_cast((min_v + max_v) / 2)); + v[5] = min_v; + v[31] = max_v; + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = gdal::max_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_max], max_v); + } +} + +TEST_F(test_gdal_minmax_element, uint16) +{ + using T = uint16_t; + constexpr GDALDataType eDT = GDT_UInt16; + T min_v = 1000; + T max_v = 2000; + { + T nodata = 0; + std::vector v{max_v, nodata, min_v}; + { + auto idx_min = gdal::min_element(v.data(), 0, eDT, true, nodata); + EXPECT_EQ(idx_min, 0); + } + { + auto idx_min = gdal::min_element(v.data(), 0, eDT, false, 0); + EXPECT_EQ(idx_min, 0); + } + { + auto idx_min = + gdal::min_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = + gdal::max_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_max], max_v); + } + { + auto [idx_min, idx_max] = + gdal::minmax_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_min], min_v); + EXPECT_EQ(v[idx_max], max_v); + } + } + { + T nodata = 0; + std::vector v{nodata, max_v, min_v}; + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = gdal::max_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_max], max_v); + } + { + std::vector v{static_cast((min_v + max_v) / 2), max_v, min_v}; + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = gdal::max_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_max], max_v); + } + { + std::vector v(257, static_cast((min_v + max_v) / 2)); + v[5] = min_v; + v[31] = max_v; + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = gdal::max_element(v.data(), v.size(), eDT, false, 0); 
+ EXPECT_EQ(v[idx_max], max_v); + } +} + +TEST_F(test_gdal_minmax_element, int16) +{ + using T = int16_t; + constexpr GDALDataType eDT = GDT_Int16; + T min_v = -1000; + T max_v = 2000; + { + T nodata = 0; + std::vector v{max_v, nodata, min_v}; + { + auto idx_min = gdal::min_element(v.data(), 0, eDT, true, nodata); + EXPECT_EQ(idx_min, 0); + } + { + auto idx_min = gdal::min_element(v.data(), 0, eDT, false, 0); + EXPECT_EQ(idx_min, 0); + } + { + auto idx_min = + gdal::min_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = + gdal::max_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_max], max_v); + } + { + auto [idx_min, idx_max] = + gdal::minmax_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_min], min_v); + EXPECT_EQ(v[idx_max], max_v); + } + } + { + T nodata = 0; + std::vector v{nodata, max_v, min_v}; + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = gdal::max_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_max], max_v); + } + { + std::vector v{static_cast((min_v + max_v) / 2), max_v, min_v}; + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = gdal::max_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_max], max_v); + } + { + std::vector v(257, static_cast((min_v + max_v) / 2)); + v[5] = min_v; + v[31] = max_v; + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = gdal::max_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_max], max_v); + } +} + +TEST_F(test_gdal_minmax_element, uint32) +{ + using T = uint32_t; + constexpr GDALDataType eDT = GDT_UInt32; + T min_v = 10000000; + T max_v = 20000000; + { + T nodata = 0; + std::vector v{max_v, nodata, min_v}; + { + auto idx_min = gdal::min_element(v.data(), 0, eDT, true, nodata); + EXPECT_EQ(idx_min, 0); + } + { + auto idx_min = gdal::min_element(v.data(), 0, eDT, false, 0); + EXPECT_EQ(idx_min, 0); + } + { + auto idx_min = + gdal::min_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = + gdal::max_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_max], max_v); + } + { + auto [idx_min, idx_max] = + gdal::minmax_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_min], min_v); + EXPECT_EQ(v[idx_max], max_v); + } + } + { + T nodata = 0; + std::vector v{nodata, max_v, min_v}; + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = gdal::max_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_max], max_v); + } + { + std::vector v{static_cast((min_v + max_v) / 2), max_v, min_v}; + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = gdal::max_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_max], max_v); + } + { + std::vector v(257, static_cast((min_v + max_v) / 2)); + v[5] = min_v; + v[31] = max_v; + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = gdal::max_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_max], max_v); + } +} + +TEST_F(test_gdal_minmax_element, int32) +{ + using T = int32_t; + constexpr GDALDataType eDT = GDT_Int32; + T min_v = -10000000; + T max_v = 20000000; + { + T nodata = 0; + std::vector 
v{max_v, nodata, min_v}; + { + auto idx_min = gdal::min_element(v.data(), 0, eDT, true, nodata); + EXPECT_EQ(idx_min, 0); + } + { + auto idx_min = gdal::min_element(v.data(), 0, eDT, false, 0); + EXPECT_EQ(idx_min, 0); + } + { + auto idx_min = + gdal::min_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = + gdal::max_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_max], max_v); + } + { + auto [idx_min, idx_max] = + gdal::minmax_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_min], min_v); + EXPECT_EQ(v[idx_max], max_v); + } + } + { + T nodata = 0; + std::vector v{nodata, max_v, min_v}; + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = gdal::max_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_max], max_v); + } + { + std::vector v{static_cast((min_v + max_v) / 2), max_v, min_v}; + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = gdal::max_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_max], max_v); + } + { + std::vector v(257, static_cast((min_v + max_v) / 2)); + v[5] = min_v; + v[31] = max_v; + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = gdal::max_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_max], max_v); + } +} + +TEST_F(test_gdal_minmax_element, uint64) +{ + using T = uint64_t; + constexpr GDALDataType eDT = GDT_UInt64; + T min_v = 100000000000000; + T max_v = 200000000000000; + { + double nodata = 0; + std::vector v{max_v, static_cast(nodata), min_v}; + { + auto idx_min = gdal::min_element(v.data(), 0, eDT, true, nodata); + EXPECT_EQ(idx_min, 0); + } + { + auto idx_min = gdal::min_element(v.data(), 0, eDT, false, 0); + EXPECT_EQ(idx_min, 0); + } + { + auto idx_min = + gdal::min_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = + gdal::max_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_max], max_v); + } + { + auto [idx_min, idx_max] = + gdal::minmax_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_min], min_v); + EXPECT_EQ(v[idx_max], max_v); + } + } + { + double nodata = 0; + std::vector v{static_cast(nodata), max_v, min_v}; + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = gdal::max_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_max], max_v); + } + { + std::vector v{static_cast((min_v + max_v) / 2), max_v, min_v}; + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = gdal::max_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_max], max_v); + } + { + std::vector v(257, static_cast((min_v + max_v) / 2)); + v[5] = min_v; + v[31] = max_v; + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = gdal::max_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_max], max_v); + } +} + +TEST_F(test_gdal_minmax_element, int64) +{ + using T = int64_t; + constexpr GDALDataType eDT = GDT_Int64; + T min_v = -100000000000000; + T max_v = 200000000000000; + { + double nodata = 0; + std::vector v{max_v, static_cast(nodata), min_v}; + { + auto idx_min = gdal::min_element(v.data(), 0, eDT, true, nodata); + EXPECT_EQ(idx_min, 0); + } + { + auto 
idx_min = gdal::min_element(v.data(), 0, eDT, false, 0); + EXPECT_EQ(idx_min, 0); + } + { + auto idx_min = + gdal::min_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = + gdal::max_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_max], max_v); + } + { + auto [idx_min, idx_max] = + gdal::minmax_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_min], min_v); + EXPECT_EQ(v[idx_max], max_v); + } + } + { + double nodata = 0; + std::vector v{static_cast(nodata), max_v, min_v}; + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = gdal::max_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_max], max_v); + } + { + std::vector v{static_cast((min_v + max_v) / 2), + max_v - 1, + max_v, + max_v - 1, + min_v + 1, + min_v, + min_v + 1}; + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = gdal::max_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_max], max_v); + } + { + std::vector v(257, static_cast((min_v + max_v) / 2)); + v[5] = min_v; + v[31] = max_v; + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = gdal::max_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_max], max_v); + } +} + +TEST_F(test_gdal_minmax_element, float32) +{ + using T = float; + constexpr GDALDataType eDT = GDT_Float32; + T min_v = 1.0f; + T max_v = 1.5f; + { + T nodata = 2.0f; + std::vector v{max_v, nodata, min_v}; + { + auto idx_min = gdal::min_element(v.data(), 0, eDT, true, nodata); + EXPECT_EQ(idx_min, 0); + } + { + auto idx_min = gdal::min_element(v.data(), 0, eDT, false, 0); + EXPECT_EQ(idx_min, 0); + } + { + auto idx_min = + gdal::min_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = + gdal::max_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_max], max_v); + } + { + auto [idx_min, idx_max] = + gdal::minmax_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_min], min_v); + EXPECT_EQ(v[idx_max], max_v); + } + } + { + T nodata = 2.0f; + std::vector v{nodata, max_v, min_v}; + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = gdal::max_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_max], max_v); + } + { + T nodata = 2.0f; + std::vector v{std::numeric_limits::quiet_NaN(), + std::numeric_limits::quiet_NaN(), nodata, max_v, + min_v}; + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = gdal::max_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_max], max_v); + } + { + T nodata = std::numeric_limits::quiet_NaN(); + std::vector v{std::numeric_limits::quiet_NaN(), + std::numeric_limits::quiet_NaN(), nodata, max_v, + min_v}; + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = gdal::max_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_max], max_v); + } + { + std::vector v{std::numeric_limits::quiet_NaN(), + std::numeric_limits::quiet_NaN(), + max_v, + std::numeric_limits::quiet_NaN(), + min_v, + std::numeric_limits::quiet_NaN()}; + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = 
gdal::max_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_max], max_v); + } + { + std::vector v{max_v, std::numeric_limits::quiet_NaN(), min_v}; + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = gdal::max_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_max], max_v); + } + { + std::vector v(257, std::numeric_limits::quiet_NaN()); + v[125] = static_cast(min_v + 0.1f); + v[126] = min_v; + v[127] = static_cast(min_v + 0.1f); + v[128] = static_cast(max_v - 0.1f); + v[129] = max_v; + v[130] = static_cast(max_v - 0.1f); + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = gdal::max_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_max], max_v); + } + { + std::vector v(33, 1.2f); + v[5] = min_v; + v[15] = max_v; + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = gdal::max_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_max], max_v); + } + { + std::vector v(255, std::numeric_limits::quiet_NaN()); + v[v.size() - 2] = min_v; + v.back() = max_v; + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = gdal::max_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_max], max_v); + } +} + +TEST_F(test_gdal_minmax_element, float64) +{ + using T = double; + constexpr GDALDataType eDT = GDT_Float64; + T min_v = 1.0; + T max_v = 1.5; + { + T nodata = 2.0; + std::vector v{max_v, nodata, min_v}; + { + auto idx_min = gdal::min_element(v.data(), 0, eDT, true, nodata); + EXPECT_EQ(idx_min, 0); + } + { + auto idx_min = gdal::min_element(v.data(), 0, eDT, false, 0); + EXPECT_EQ(idx_min, 0); + } + { + auto idx_min = + gdal::min_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = + gdal::max_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_max], max_v); + } + { + auto [idx_min, idx_max] = + gdal::minmax_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_min], min_v); + EXPECT_EQ(v[idx_max], max_v); + } + } + { + T nodata = 2.0; + std::vector v{nodata, max_v, min_v}; + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = gdal::max_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_max], max_v); + } + { + T nodata = 2.0; + std::vector v{std::numeric_limits::quiet_NaN(), + std::numeric_limits::quiet_NaN(), nodata, max_v, + min_v}; + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = gdal::max_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_max], max_v); + } + { + std::vector v{max_v, std::numeric_limits::quiet_NaN(), min_v}; + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = gdal::max_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_max], max_v); + } + { + std::vector v{std::numeric_limits::quiet_NaN(), + std::numeric_limits::quiet_NaN(), + max_v, + std::numeric_limits::quiet_NaN(), + min_v, + std::numeric_limits::quiet_NaN()}; + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = gdal::max_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_max], max_v); + } + { + std::vector v(33, 
std::numeric_limits::quiet_NaN()); + v[5] = min_v; + v[15] = max_v; + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = gdal::max_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_max], max_v); + } + { + std::vector v(255, std::numeric_limits::quiet_NaN()); + v[v.size() - 2] = min_v; + v.back() = max_v; + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = gdal::max_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_max], max_v); + } +} + +TEST_F(test_gdal_minmax_element, unsupported) +{ + float v[2] = {0, 0}; + CPLErrorHandlerPusher oErrorHandler(CPLQuietErrorHandler); + { + CPLErrorReset(); + EXPECT_EQ(gdal::min_element(v, 1, GDT_CFloat32, false, 0), 0); + EXPECT_EQ(CPLGetLastErrorNo(), CPLE_NotSupported); + } + { + CPLErrorReset(); + EXPECT_EQ(gdal::max_element(v, 1, GDT_CFloat32, false, 0), 0); + EXPECT_EQ(CPLGetLastErrorNo(), CPLE_NotSupported); + } + { + CPLErrorReset(); + auto [idx_min, idx_max] = + gdal::minmax_element(v, 1, GDT_CFloat32, false, 0); + EXPECT_EQ(idx_min, 0); + EXPECT_EQ(idx_max, 0); + EXPECT_EQ(CPLGetLastErrorNo(), CPLE_NotSupported); + } +} + +} // namespace diff --git a/gcore/CMakeLists.txt b/gcore/CMakeLists.txt index 57a114bc3205..08fe7e646b7b 100644 --- a/gcore/CMakeLists.txt +++ b/gcore/CMakeLists.txt @@ -209,6 +209,7 @@ target_public_header( gdalsubdatasetinfo.h gdal_typetraits.h gdal_adbc.h + gdal_minmax_element.hpp ) set(GDAL_DATA_FILES diff --git a/gcore/gdal_minmax_element.hpp b/gcore/gdal_minmax_element.hpp new file mode 100644 index 000000000000..cf4e6a93aafc --- /dev/null +++ b/gcore/gdal_minmax_element.hpp @@ -0,0 +1,971 @@ +/****************************************************************************** + * Project: GDAL Core + * Purpose: Utility functions to find minimum and maximum values in a buffer + * Author: Even Rouault, + * + ****************************************************************************** + * Copyright (c) 2024, Even Rouault + * + * SPDX-License-Identifier: MIT + ****************************************************************************/ + +#ifndef GDAL_MINMAX_ELEMENT_INCLUDED +#define GDAL_MINMAX_ELEMENT_INCLUDED + +// NOTE: This header requires C++17 + +// This file may be vendored by other applications than GDAL +// WARNING: if modifying this file, please also update the upstream GDAL version +// at https://github.com/OSGeo/gdal/blob/master/gcore/gdal_minmax_element.hpp + +#include +#include +#include +#include +#include +#include + +#include "gdal.h" + +#ifdef GDAL_COMPILATION +#define GDAL_MINMAXELT_NS gdal +#elif !defined(GDAL_MINMAXELT_NS) +#error "Please define the GDAL_MINMAXELT_NS macro to define the namespace" +#endif + +#if defined(__x86_64) || defined(_M_X64) +// SSE2 header +#include +#endif + +#include "gdal_priv_templates.hpp" +#if GDAL_VERSION < GDAL_COMPUTE_VERSION(3, 10, 0) +// For vendoring in other applications +namespace GDAL_MINMAXELT_NS +{ +template inline bool GDALIsValueExactAs(double dfValue) +{ + return GDALIsValueInRange(dfValue) && + static_cast(static_cast(dfValue)) == dfValue; +} + +template <> inline bool GDALIsValueExactAs(double dfValue) +{ + return std::isnan(dfValue) || + (GDALIsValueInRange(dfValue) && + static_cast(static_cast(dfValue)) == dfValue); +} + +template <> inline bool GDALIsValueExactAs(double) +{ + return true; +} +} // namespace GDAL_MINMAXELT_NS +#endif + +namespace GDAL_MINMAXELT_NS +{ +namespace detail +{ + 
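+/* Illustrative usage sketch (editor's addition, not part of the API itself).
+ * The public entry points declared further down in this header are
+ * min_element(), max_element() and minmax_element(). A vendoring
+ * application (the namespace name "app" below is hypothetical) would
+ * typically use them along these lines:
+ *
+ *   #define GDAL_MINMAXELT_NS app   // only needed outside of GDAL itself
+ *   #include "gdal_minmax_element.hpp"
+ *
+ *   std::vector<float> pixels = ...;          // a GDT_Float32 buffer
+ *   const bool bHasNoData = true;
+ *   const double dfNoData = -9999.0;
+ *   const auto [idxMin, idxMax] = app::minmax_element(
+ *       pixels.data(), pixels.size(), GDT_Float32, bHasNoData, dfNoData);
+ *   const float fMin = pixels[idxMin];
+ *   const float fMax = pixels[idxMax];
+ */
+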
+/************************************************************************/ +/* compScalar() */ +/************************************************************************/ + +template inline static bool compScalar(T x, T y) +{ + if constexpr (IS_MAX) + return x > y; + else + return x < y; +} + +/************************************************************************/ +/* extremum_element() */ +/************************************************************************/ + +template +size_t extremum_element(const T *v, size_t size, T noDataValue) +{ + static_assert(!(std::is_floating_point_v)); + if (size == 0) + return 0; + size_t idx_of_extremum = 0; + T extremum = v[0]; + bool extremum_is_nodata = extremum == noDataValue; + size_t i = 1; + for (; i < size; ++i) + { + if (v[i] != noDataValue && + (compScalar(v[i], extremum) || extremum_is_nodata)) + { + extremum = v[i]; + idx_of_extremum = i; + extremum_is_nodata = false; + } + } + return idx_of_extremum; +} + +/************************************************************************/ +/* extremum_element() */ +/************************************************************************/ + +template size_t extremum_element(const T *v, size_t size) +{ + static_assert(!(std::is_floating_point_v)); + if (size == 0) + return 0; + size_t idx_of_extremum = 0; + T extremum = v[0]; + size_t i = 1; + for (; i < size; ++i) + { + if (compScalar(v[i], extremum)) + { + extremum = v[i]; + idx_of_extremum = i; + } + } + return idx_of_extremum; +} + +#if defined(__x86_64) || defined(_M_X64) + +/************************************************************************/ +/* extremum_element_with_nan() */ +/************************************************************************/ + +static inline int8_t Shift8(uint8_t x) +{ + return static_cast(x + std::numeric_limits::min()); +} + +static inline int16_t Shift16(uint16_t x) +{ + return static_cast(x + std::numeric_limits::min()); +} + +CPL_NOSANITIZE_UNSIGNED_INT_OVERFLOW +static inline int32_t Shift32(uint32_t x) +{ + x += static_cast(std::numeric_limits::min()); + int32_t ret; + memcpy(&ret, &x, sizeof(x)); + return ret; +} + +// Return a _mm128[i|d] register with all its elements set to x +template static inline auto set1(T x) +{ + if constexpr (std::is_same_v) + return _mm_set1_epi8(Shift8(x)); + else if constexpr (std::is_same_v) + return _mm_set1_epi8(x); + else if constexpr (std::is_same_v) + return _mm_set1_epi16(Shift16(x)); + else if constexpr (std::is_same_v) + return _mm_set1_epi16(x); + else if constexpr (std::is_same_v) + return _mm_set1_epi32(Shift32(x)); + else if constexpr (std::is_same_v) + return _mm_set1_epi32(x); + else if constexpr (std::is_same_v) + return _mm_set1_ps(x); + else + return _mm_set1_pd(x); +} + +// Load as many values of type T at a _mm128[i|d] register can contain from x +template static inline auto loadv(const T *x) +{ + if constexpr (std::is_same_v) + return _mm_loadu_ps(x); + else if constexpr (std::is_same_v) + return _mm_loadu_pd(x); + else + return _mm_loadu_si128(reinterpret_cast(x)); +} + +// Return a __m128i register with bits set when x[i] < y[i] when !IS_MAX +// or x[i] > y[i] when IS_MAX +template +static inline __m128i comp(SSE_T x, SSE_T y) +{ + if constexpr (IS_MAX) + { + if constexpr (std::is_same_v) + return _mm_cmpgt_epi8( + _mm_add_epi8(x, + _mm_set1_epi8(std::numeric_limits::min())), + y); + else if constexpr (std::is_same_v) + return _mm_cmpgt_epi8(x, y); + else if constexpr (std::is_same_v) + return _mm_cmpgt_epi16( + _mm_add_epi16( + x, 
_mm_set1_epi16(std::numeric_limits::min())), + y); + else if constexpr (std::is_same_v) + return _mm_cmpgt_epi16(x, y); + else if constexpr (std::is_same_v) + return _mm_cmpgt_epi32( + _mm_add_epi32( + x, _mm_set1_epi32(std::numeric_limits::min())), + y); + else if constexpr (std::is_same_v) + return _mm_cmpgt_epi32(x, y); + // We could use _mm_cmpgt_pX() if there was no NaN values + else if constexpr (std::is_same_v) + return _mm_castps_si128(_mm_cmpnle_ps(x, y)); + else + return _mm_castpd_si128(_mm_cmpnle_pd(x, y)); + } + else + { + if constexpr (std::is_same_v) + return _mm_cmplt_epi8( + _mm_add_epi8(x, + _mm_set1_epi8(std::numeric_limits::min())), + y); + else if constexpr (std::is_same_v) + return _mm_cmplt_epi8(x, y); + else if constexpr (std::is_same_v) + return _mm_cmplt_epi16( + _mm_add_epi16( + x, _mm_set1_epi16(std::numeric_limits::min())), + y); + else if constexpr (std::is_same_v) + return _mm_cmplt_epi16(x, y); + else if constexpr (std::is_same_v) + return _mm_cmplt_epi32( + _mm_add_epi32( + x, _mm_set1_epi32(std::numeric_limits::min())), + y); + else if constexpr (std::is_same_v) + return _mm_cmplt_epi32(x, y); + // We could use _mm_cmplt_pX() if there was no NaN values + else if constexpr (std::is_same_v) + return _mm_castps_si128(_mm_cmpnge_ps(x, y)); + else + return _mm_castpd_si128(_mm_cmpnge_pd(x, y)); + } +} + +// Using SSE2 +template +inline size_t extremum_element_with_nan(const T *v, size_t size) +{ + static_assert(std::is_same_v || std::is_same_v || + std::is_same_v || std::is_same_v || + std::is_same_v || std::is_same_v || + std::is_floating_point_v); + if (size == 0) + return 0; + size_t idx_of_extremum = 0; + T extremum = v[0]; + [[maybe_unused]] bool extremum_is_nan = std::isnan(extremum); + size_t i = 1; + + constexpr size_t VALS_PER_REG = sizeof(set1(extremum)) / sizeof(extremum); + constexpr int LOOP_UNROLLING = 4; + // If changing the value, then we need to adjust the number of sse_valX + // loading in the loop. + static_assert(LOOP_UNROLLING == 4); + constexpr size_t VALS_PER_ITER = VALS_PER_REG * LOOP_UNROLLING; + + const auto update = + [v, &extremum, &idx_of_extremum, &extremum_is_nan](size_t idx) + { + if (compScalar(v[idx], extremum)) + { + extremum = v[idx]; + idx_of_extremum = idx; + extremum_is_nan = false; + } + else if constexpr (std::is_floating_point_v) + { + if (extremum_is_nan && !std::isnan(v[idx])) + { + extremum = v[idx]; + idx_of_extremum = idx; + extremum_is_nan = false; + } + } + }; + + for (; i < VALS_PER_ITER && i < size; ++i) + { + update(i); + } + + auto sse_extremum = set1(extremum); + + [[maybe_unused]] size_t hits = 0; + const auto sse_iter_count = (size / VALS_PER_ITER) * VALS_PER_ITER; + for (; i < sse_iter_count; i += VALS_PER_ITER) + { + // A bit of loop unrolling to save 3/4 of slow movemask operations. + const auto sse_val0 = loadv(v + i + 0 * VALS_PER_REG); + const auto sse_val1 = loadv(v + i + 1 * VALS_PER_REG); + const auto sse_val2 = loadv(v + i + 2 * VALS_PER_REG); + const auto sse_val3 = loadv(v + i + 3 * VALS_PER_REG); + if (_mm_movemask_epi8(_mm_or_si128( + _mm_or_si128(comp(sse_val0, sse_extremum), + comp(sse_val1, sse_extremum)), + _mm_or_si128(comp(sse_val2, sse_extremum), + comp(sse_val3, sse_extremum)))) != 0) + { + if constexpr (!std::is_same_v && + !std::is_same_v) + { + // The above tests excluding int8_t/uint8_t is due to the fact + // with those small ranges of values we will quickly converge + // to the minimum, so no need to do the below "smart" test. 
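+                // (Editor's note: `hits` counts the SIMD chunks in which a
+                // candidate extremum was found; each chunk covers
+                // VALS_PER_ITER values. Once size / 16 such chunks have been
+                // seen, the running extremum is being refreshed so often,
+                // as with nearly sorted input, that re-broadcasting it no
+                // longer pays off, and the scalar tail loop below finishes
+                // the job.)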
+ + if (++hits == size / 16) + { + // If we have an almost sorted array, then using this code path + // will hurt performance. Arbitrary give up if we get here + // more than 1. / 16 of the size of the array. + // fprintf(stderr, "going to non-vector path\n"); + break; + } + } + for (size_t j = 0; j < VALS_PER_ITER; j++) + { + update(i + j); + } + sse_extremum = set1(extremum); + } + } + for (; i < size; ++i) + { + update(i); + } + return idx_of_extremum; +} + +#else + +/************************************************************************/ +/* extremum_element_with_nan() */ +/************************************************************************/ + +template +inline size_t extremum_element_with_nan(const T *v, size_t size) +{ + if (size == 0) + return 0; + size_t idx_of_extremum = 0; + auto extremum = v[0]; + bool extremum_is_nan = std::isnan(extremum); + size_t i = 1; + for (; i < size; ++i) + { + if (compScalar(v[i], extremum) || + (extremum_is_nan && !std::isnan(v[i]))) + { + extremum = v[i]; + idx_of_extremum = i; + extremum_is_nan = false; + } + } + return idx_of_extremum; +} +#endif + +/************************************************************************/ +/* extremum_element() */ +/************************************************************************/ + +#if defined(__x86_64) || defined(_M_X64) + +template <> +size_t extremum_element(const uint8_t *v, size_t size) +{ + return extremum_element_with_nan(v, size); +} + +template <> +size_t extremum_element(const uint8_t *v, size_t size) +{ + return extremum_element_with_nan(v, size); +} + +template <> size_t extremum_element(const int8_t *v, size_t size) +{ + return extremum_element_with_nan(v, size); +} + +template <> size_t extremum_element(const int8_t *v, size_t size) +{ + return extremum_element_with_nan(v, size); +} + +template <> +size_t extremum_element(const uint16_t *v, size_t size) +{ + return extremum_element_with_nan(v, size); +} + +template <> +size_t extremum_element(const uint16_t *v, size_t size) +{ + return extremum_element_with_nan(v, size); +} + +template <> +size_t extremum_element(const int16_t *v, size_t size) +{ + return extremum_element_with_nan(v, size); +} + +template <> +size_t extremum_element(const int16_t *v, size_t size) +{ + return extremum_element_with_nan(v, size); +} + +template <> +size_t extremum_element(const uint32_t *v, size_t size) +{ + return extremum_element_with_nan(v, size); +} + +template <> +size_t extremum_element(const uint32_t *v, size_t size) +{ + return extremum_element_with_nan(v, size); +} + +template <> +size_t extremum_element(const int32_t *v, size_t size) +{ + return extremum_element_with_nan(v, size); +} + +template <> +size_t extremum_element(const int32_t *v, size_t size) +{ + return extremum_element_with_nan(v, size); +} + +#endif + +template <> size_t extremum_element(const float *v, size_t size) +{ + return extremum_element_with_nan(v, size); +} + +template <> size_t extremum_element(const double *v, size_t size) +{ + return extremum_element_with_nan(v, size); +} + +template <> size_t extremum_element(const float *v, size_t size) +{ + return extremum_element_with_nan(v, size); +} + +template <> size_t extremum_element(const double *v, size_t size) +{ + return extremum_element_with_nan(v, size); +} + +/************************************************************************/ +/* extremum_element_with_nan() */ +/************************************************************************/ + +template +inline size_t extremum_element_with_nan(const T *v, size_t size, 
T noDataValue) +{ + if (std::isnan(noDataValue)) + return extremum_element_with_nan(v, size); + if (size == 0) + return 0; + size_t idx_of_extremum = 0; + auto extremum = v[0]; + bool extremum_is_nan_or_nodata = + std::isnan(extremum) || (extremum == noDataValue); + size_t i = 1; + for (; i < size; ++i) + { + if (v[i] != noDataValue && + (compScalar(v[i], extremum) || + (extremum_is_nan_or_nodata && !std::isnan(v[i])))) + { + extremum = v[i]; + idx_of_extremum = i; + extremum_is_nan_or_nodata = false; + } + } + return idx_of_extremum; +} + +/************************************************************************/ +/* extremum_element() */ +/************************************************************************/ + +template <> +size_t extremum_element(const float *v, size_t size, + float noDataValue) +{ + return extremum_element_with_nan(v, size, noDataValue); +} + +template <> +size_t extremum_element(const double *v, size_t size, + double noDataValue) +{ + return extremum_element_with_nan(v, size, noDataValue); +} + +template <> +size_t extremum_element(const float *v, size_t size, + float noDataValue) +{ + return extremum_element_with_nan(v, size, noDataValue); +} + +template <> +size_t extremum_element(const double *v, size_t size, + double noDataValue) +{ + return extremum_element_with_nan(v, size, noDataValue); +} + +template +inline size_t extremum_element(const T *buffer, size_t size, bool bHasNoData, + T noDataValue) +{ + if (bHasNoData) + return extremum_element(buffer, size, noDataValue); + else + return extremum_element(buffer, size); +} + +template +size_t extremum_element(const void *buffer, size_t nElts, GDALDataType eDT, + bool bHasNoData, double dfNoDataValue) +{ + switch (eDT) + { + case GDT_Int8: + { + using T = int8_t; + bHasNoData = bHasNoData && GDALIsValueExactAs(dfNoDataValue); + return extremum_element( + static_cast(buffer), nElts, bHasNoData, + bHasNoData ? static_cast(dfNoDataValue) : 0); + } + case GDT_Byte: + { + using T = uint8_t; + bHasNoData = bHasNoData && GDALIsValueExactAs(dfNoDataValue); + return extremum_element( + static_cast(buffer), nElts, bHasNoData, + bHasNoData ? static_cast(dfNoDataValue) : 0); + } + case GDT_Int16: + { + using T = int16_t; + bHasNoData = bHasNoData && GDALIsValueExactAs(dfNoDataValue); + return extremum_element( + static_cast(buffer), nElts, bHasNoData, + bHasNoData ? static_cast(dfNoDataValue) : 0); + } + case GDT_UInt16: + { + using T = uint16_t; + bHasNoData = bHasNoData && GDALIsValueExactAs(dfNoDataValue); + return extremum_element( + static_cast(buffer), nElts, bHasNoData, + bHasNoData ? static_cast(dfNoDataValue) : 0); + } + case GDT_Int32: + { + using T = int32_t; + bHasNoData = bHasNoData && GDALIsValueExactAs(dfNoDataValue); + return extremum_element( + static_cast(buffer), nElts, bHasNoData, + bHasNoData ? static_cast(dfNoDataValue) : 0); + } + case GDT_UInt32: + { + using T = uint32_t; + bHasNoData = bHasNoData && GDALIsValueExactAs(dfNoDataValue); + return extremum_element( + static_cast(buffer), nElts, bHasNoData, + bHasNoData ? static_cast(dfNoDataValue) : 0); + } + case GDT_Int64: + { + using T = int64_t; + bHasNoData = bHasNoData && GDALIsValueExactAs(dfNoDataValue); + return extremum_element( + static_cast(buffer), nElts, bHasNoData, + bHasNoData ? static_cast(dfNoDataValue) : 0); + } + case GDT_UInt64: + { + using T = uint64_t; + bHasNoData = bHasNoData && GDALIsValueExactAs(dfNoDataValue); + return extremum_element( + static_cast(buffer), nElts, bHasNoData, + bHasNoData ? 
static_cast(dfNoDataValue) : 0); + } + case GDT_Float32: + { + using T = float; + bHasNoData = bHasNoData && GDALIsValueExactAs(dfNoDataValue); + return extremum_element( + static_cast(buffer), nElts, bHasNoData, + bHasNoData ? static_cast(dfNoDataValue) : 0); + } + case GDT_Float64: + { + using T = double; + bHasNoData = bHasNoData && GDALIsValueExactAs(dfNoDataValue); + return extremum_element( + static_cast(buffer), nElts, bHasNoData, + bHasNoData ? static_cast(dfNoDataValue) : 0); + } + default: + break; + } + CPLError(CE_Failure, CPLE_NotSupported, + "%s not supported for this data type.", __FUNCTION__); + return 0; +} + +} // namespace detail + +/************************************************************************/ +/* max_element() */ +/************************************************************************/ + +/** Return the index of the element where the maximum value is hit. + * + * If it is hit in several locations, it is not specified which one will be + * returned. + * + * @param buffer Vector of nElts elements of type eDT. + * @param nElts Number of elements in buffer. + * @param eDT Data type of the elements of buffer. + * @param bHasNoData Whether dfNoDataValue is valid. + * @param dfNoDataValue Nodata value, only taken into account if bHasNoData == true + * + * @since GDAL 3.11 + */ +inline size_t max_element(const void *buffer, size_t nElts, GDALDataType eDT, + bool bHasNoData, double dfNoDataValue) +{ + return detail::extremum_element(buffer, nElts, eDT, bHasNoData, + dfNoDataValue); +} + +/************************************************************************/ +/* min_element() */ +/************************************************************************/ + +/** Return the index of the element where the minimum value is hit. + * + * If it is hit in several locations, it is not specified which one will be + * returned. + * + * @param buffer Vector of nElts elements of type eDT. + * @param nElts Number of elements in buffer. + * @param eDT Data type of the elements of buffer. + * @param bHasNoData Whether dfNoDataValue is valid. 
+ * @param dfNoDataValue Nodata value, only taken into account if bHasNoData == true + * + * @since GDAL 3.11 + */ +inline size_t min_element(const void *buffer, size_t nElts, GDALDataType eDT, + bool bHasNoData, double dfNoDataValue) +{ + return detail::extremum_element(buffer, nElts, eDT, bHasNoData, + dfNoDataValue); +} + +namespace detail +{ + +#ifdef NOT_EFFICIENT + +/************************************************************************/ +/* minmax_element() */ +/************************************************************************/ + +template +std::pair minmax_element(const T *v, size_t size, T noDataValue) +{ + static_assert(!(std::is_floating_point_v)); + if (size == 0) + return std::pair(0, 0); + size_t idx_of_min = 0; + size_t idx_of_max = 0; + T vmin = v[0]; + T vmax = v[0]; + bool extremum_is_nodata = vmin == noDataValue; + size_t i = 1; + for (; i < size; ++i) + { + if (v[i] != noDataValue && (v[i] < vmin || extremum_is_nodata)) + { + vmin = v[i]; + idx_of_min = i; + extremum_is_nodata = false; + } + if (v[i] != noDataValue && (v[i] > vmax || extremum_is_nodata)) + { + vmax = v[i]; + idx_of_max = i; + extremum_is_nodata = false; + } + } + return std::pair(idx_of_min, idx_of_max); +} + +template +std::pair minmax_element(const T *v, size_t size) +{ + static_assert(!(std::is_floating_point_v)); + if (size == 0) + return std::pair(0, 0); + size_t idx_of_min = 0; + size_t idx_of_max = 0; + T vmin = v[0]; + T vmax = v[0]; + size_t i = 1; + for (; i < size; ++i) + { + if (v[i] < vmin) + { + vmin = v[i]; + idx_of_min = i; + } + if (v[i] > vmax) + { + vmax = v[i]; + idx_of_max = i; + } + } + return std::pair(idx_of_min, idx_of_max); +} + +template +inline std::pair minmax_element_with_nan(const T *v, + size_t size) +{ + if (size == 0) + return std::pair(0, 0); + size_t idx_of_min = 0; + size_t idx_of_max = 0; + T vmin = v[0]; + T vmax = v[0]; + size_t i = 1; + if (std::isnan(v[0])) + { + for (; i < size; ++i) + { + if (!std::isnan(v[i])) + { + vmin = v[i]; + idx_of_min = i; + vmax = v[i]; + idx_of_max = i; + break; + } + } + } + for (; i < size; ++i) + { + if (v[i] < vmin) + { + vmin = v[i]; + idx_of_min = i; + } + if (v[i] > vmax) + { + vmax = v[i]; + idx_of_max = i; + } + } + return std::pair(idx_of_min, idx_of_max); +} + +template <> +std::pair minmax_element(const float *v, size_t size) +{ + return minmax_element_with_nan(v, size); +} + +template <> +std::pair minmax_element(const double *v, size_t size) +{ + return minmax_element_with_nan(v, size); +} + +template +inline std::pair minmax_element(const T *buffer, size_t size, + bool bHasNoData, T noDataValue) +{ + if (bHasNoData) + { + return minmax_element(buffer, size, noDataValue); + } + else + { + return minmax_element(buffer, size); + } +} +#else + +/************************************************************************/ +/* minmax_element() */ +/************************************************************************/ + +template +inline std::pair minmax_element(const T *buffer, size_t size, + bool bHasNoData, T noDataValue) +{ +#ifdef NOT_EFFICIENT + if (bHasNoData) + { + return minmax_element(buffer, size, noDataValue); + } + else + { + return minmax_element(buffer, size); + //auto [imin, imax] = std::minmax_element(buffer, buffer + size); + //return std::pair(imin - buffer, imax - buffer); + } +#else + // Using separately min and max is more efficient than computing them + // within the same loop + return std::pair( + extremum_element(buffer, size, bHasNoData, noDataValue), + extremum_element(buffer, size, 
bHasNoData, noDataValue)); +#endif +} +#endif + +} // namespace detail + +/************************************************************************/ +/* minmax_element() */ +/************************************************************************/ + +/** Return the index of the elements where the minimum and maximum values are hit. + * + * If they are hit in several locations, it is not specified which one will be + * returned (contrary to std::minmax_element). + * + * @param buffer Vector of nElts elements of type eDT. + * @param nElts Number of elements in buffer. + * @param eDT Data type of the elements of buffer. + * @param bHasNoData Whether dfNoDataValue is valid. + * @param dfNoDataValue Nodata value, only taken into account if bHasNoData == true + * + * @since GDAL 3.11 + */ +inline std::pair minmax_element(const void *buffer, + size_t nElts, GDALDataType eDT, + bool bHasNoData, + double dfNoDataValue) +{ + switch (eDT) + { + case GDT_Int8: + { + using T = int8_t; + bHasNoData = bHasNoData && GDALIsValueExactAs(dfNoDataValue); + return detail::minmax_element( + static_cast(buffer), nElts, bHasNoData, + bHasNoData ? static_cast(dfNoDataValue) : 0); + } + case GDT_Byte: + { + using T = uint8_t; + bHasNoData = bHasNoData && GDALIsValueExactAs(dfNoDataValue); + return detail::minmax_element( + static_cast(buffer), nElts, bHasNoData, + bHasNoData ? static_cast(dfNoDataValue) : 0); + } + case GDT_Int16: + { + using T = int16_t; + bHasNoData = bHasNoData && GDALIsValueExactAs(dfNoDataValue); + return detail::minmax_element( + static_cast(buffer), nElts, bHasNoData, + bHasNoData ? static_cast(dfNoDataValue) : 0); + } + case GDT_UInt16: + { + using T = uint16_t; + bHasNoData = bHasNoData && GDALIsValueExactAs(dfNoDataValue); + return detail::minmax_element( + static_cast(buffer), nElts, bHasNoData, + bHasNoData ? static_cast(dfNoDataValue) : 0); + } + case GDT_Int32: + { + using T = int32_t; + bHasNoData = bHasNoData && GDALIsValueExactAs(dfNoDataValue); + return detail::minmax_element( + static_cast(buffer), nElts, bHasNoData, + bHasNoData ? static_cast(dfNoDataValue) : 0); + } + case GDT_UInt32: + { + using T = uint32_t; + bHasNoData = bHasNoData && GDALIsValueExactAs(dfNoDataValue); + return detail::minmax_element( + static_cast(buffer), nElts, bHasNoData, + bHasNoData ? static_cast(dfNoDataValue) : 0); + } + case GDT_Int64: + { + using T = int64_t; + bHasNoData = bHasNoData && GDALIsValueExactAs(dfNoDataValue); + return detail::minmax_element( + static_cast(buffer), nElts, bHasNoData, + bHasNoData ? static_cast(dfNoDataValue) : 0); + } + case GDT_UInt64: + { + using T = uint64_t; + bHasNoData = bHasNoData && GDALIsValueExactAs(dfNoDataValue); + return detail::minmax_element( + static_cast(buffer), nElts, bHasNoData, + bHasNoData ? static_cast(dfNoDataValue) : 0); + } + case GDT_Float32: + { + using T = float; + bHasNoData = bHasNoData && GDALIsValueExactAs(dfNoDataValue); + return detail::minmax_element( + static_cast(buffer), nElts, bHasNoData, + bHasNoData ? static_cast(dfNoDataValue) : 0); + } + case GDT_Float64: + { + using T = double; + bHasNoData = bHasNoData && GDALIsValueExactAs(dfNoDataValue); + return detail::minmax_element( + static_cast(buffer), nElts, bHasNoData, + bHasNoData ? 
static_cast(dfNoDataValue) : 0); + } + default: + break; + } + CPLError(CE_Failure, CPLE_NotSupported, + "%s not supported for this data type.", __FUNCTION__); + return std::pair(0, 0); +} + +} // namespace GDAL_MINMAXELT_NS + +#endif // GDAL_MINMAX_ELEMENT_INCLUDED diff --git a/perftests/CMakeLists.txt b/perftests/CMakeLists.txt index 4f365f34a326..e020f471d2b9 100644 --- a/perftests/CMakeLists.txt +++ b/perftests/CMakeLists.txt @@ -10,3 +10,7 @@ target_link_libraries(bench_ogr_batch PRIVATE $) + +add_executable(testperf_gdal_minmax_element testperf_gdal_minmax_element.cpp) +gdal_standard_includes(testperf_gdal_minmax_element) +target_link_libraries(testperf_gdal_minmax_element PRIVATE $) diff --git a/perftests/testperf_gdal_minmax_element.cpp b/perftests/testperf_gdal_minmax_element.cpp new file mode 100644 index 000000000000..77968e11f850 --- /dev/null +++ b/perftests/testperf_gdal_minmax_element.cpp @@ -0,0 +1,534 @@ +/****************************************************************************** + * Project: GDAL Core + * Purpose: Test performance of gdal_minmax_element.hpp + * Author: Even Rouault, + * + ****************************************************************************** + * Copyright (c) 2024, Even Rouault + * + * SPDX-License-Identifier: MIT + ****************************************************************************/ + +#include "gdal_minmax_element.hpp" + +#include +#include + +template void randomFill(T *v, size_t size, bool withNaN = true) +{ + std::random_device rd; + std::mt19937 gen{rd()}; + std::normal_distribution<> dist{127, 30}; + for (size_t i = 0; i < size; i++) + { + v[i] = static_cast(dist(gen)); + if constexpr (std::is_same::value || + std::is_same::value) + { + if (withNaN && (i == 0 || (i > 10 && ((i + 1) % 1024) <= 4))) + v[i] = std::numeric_limits::quiet_NaN(); + } + } +} + +int main(int /* argc */, char * /* argv */[]) +{ + constexpr size_t SIZE = 10 * 1000 * 1000 + 1; + constexpr int N_ITERS = 1; + { + using T = uint8_t; + constexpr GDALDataType eDT = GDT_Byte; + printf("uint8:\n"); + std::vector x; + x.resize(SIZE); + randomFill(x.data(), x.size()); + { + auto start = std::chrono::steady_clock::now(); + int idx = 0; + for (int i = 0; i < N_ITERS; ++i) + { + idx += static_cast( + gdal::min_element(x.data(), x.size(), eDT, false, 0)); + } + idx /= N_ITERS; + printf("min at idx %d\n", idx); + auto end = std::chrono::steady_clock::now(); + printf("-> elapsed=%d\n", static_cast((end - start).count())); + } + { + auto start = std::chrono::steady_clock::now(); + int idx = 0; + for (int i = 0; i < N_ITERS; ++i) + { + idx += static_cast(std::distance( + x.begin(), std::min_element(x.begin(), x.end()))); + } + idx /= N_ITERS; + printf("min at idx %d (using std::min_element)\n", idx); + auto end = std::chrono::steady_clock::now(); + printf("-> elapsed=%d\n", static_cast((end - start).count())); + } + { + auto start = std::chrono::steady_clock::now(); + int idx = 0; + for (int i = 0; i < N_ITERS; ++i) + { + idx += static_cast( + gdal::min_element(x.data(), x.size(), eDT, true, 0)); + } + idx /= N_ITERS; + printf("min at idx %d(nodata case)\n", idx); + auto end = std::chrono::steady_clock::now(); + printf("-> elapsed=%d\n", static_cast((end - start).count())); + } + } + printf("--------------------\n"); + { + using T = int8_t; + constexpr GDALDataType eDT = GDT_Int8; + printf("int8:\n"); + std::vector x; + x.resize(SIZE); + randomFill(x.data(), x.size()); + { + auto start = std::chrono::steady_clock::now(); + int idx = 0; + for (int i = 0; i < N_ITERS; ++i) + 
{ + idx += static_cast( + gdal::min_element(x.data(), x.size(), eDT, false, 0)); + } + idx /= N_ITERS; + printf("min at idx %d\n", idx); + auto end = std::chrono::steady_clock::now(); + printf("-> elapsed=%d\n", static_cast((end - start).count())); + } + { + auto start = std::chrono::steady_clock::now(); + int idx = 0; + for (int i = 0; i < N_ITERS; ++i) + { + idx += static_cast(std::distance( + x.begin(), std::min_element(x.begin(), x.end()))); + } + idx /= N_ITERS; + printf("min at idx %d (using std::min_element)\n", idx); + auto end = std::chrono::steady_clock::now(); + printf("-> elapsed=%d\n", static_cast((end - start).count())); + } + { + auto start = std::chrono::steady_clock::now(); + int idx = 0; + for (int i = 0; i < N_ITERS; ++i) + { + idx += static_cast( + gdal::min_element(x.data(), x.size(), eDT, true, 0)); + } + idx /= N_ITERS; + printf("min at idx %d(nodata case)\n", idx); + auto end = std::chrono::steady_clock::now(); + printf("-> elapsed=%d\n", static_cast((end - start).count())); + } + } + printf("--------------------\n"); + { + using T = uint16_t; + constexpr GDALDataType eDT = GDT_UInt16; + printf("uint16:\n"); + std::vector x; + x.resize(SIZE); + randomFill(x.data(), x.size()); + { + auto start = std::chrono::steady_clock::now(); + int idx = 0; + for (int i = 0; i < N_ITERS; ++i) + { + idx += static_cast( + gdal::min_element(x.data(), x.size(), eDT, false, 0)); + } + idx /= N_ITERS; + printf("min at idx %d\n", idx); + auto end = std::chrono::steady_clock::now(); + printf("-> elapsed=%d\n", static_cast((end - start).count())); + } + { + auto start = std::chrono::steady_clock::now(); + int idx = 0; + for (int i = 0; i < N_ITERS; ++i) + { + idx += static_cast(std::distance( + x.begin(), std::min_element(x.begin(), x.end()))); + } + idx /= N_ITERS; + printf("min at idx %d (using std::min_element)\n", idx); + auto end = std::chrono::steady_clock::now(); + printf("-> elapsed=%d\n", static_cast((end - start).count())); + } + { + auto start = std::chrono::steady_clock::now(); + int idx = 0; + for (int i = 0; i < N_ITERS; ++i) + { + idx += static_cast( + gdal::min_element(x.data(), x.size(), eDT, true, 0)); + } + idx /= N_ITERS; + printf("min at idx %d(nodata case)\n", idx); + auto end = std::chrono::steady_clock::now(); + printf("-> elapsed=%d\n", static_cast((end - start).count())); + } + } + printf("--------------------\n"); + { + using T = int16_t; + constexpr GDALDataType eDT = GDT_Int16; + printf("int16:\n"); + std::vector x; + x.resize(SIZE); + randomFill(x.data(), x.size()); + { + auto start = std::chrono::steady_clock::now(); + int idx = 0; + for (int i = 0; i < N_ITERS; ++i) + { + idx += static_cast( + gdal::min_element(x.data(), x.size(), eDT, false, 0)); + } + idx /= N_ITERS; + printf("min at idx %d\n", idx); + auto end = std::chrono::steady_clock::now(); + printf("-> elapsed=%d\n", static_cast((end - start).count())); + } + { + auto start = std::chrono::steady_clock::now(); + int idx = 0; + for (int i = 0; i < N_ITERS; ++i) + { + idx += static_cast(std::distance( + x.begin(), std::min_element(x.begin(), x.end()))); + } + idx /= N_ITERS; + printf("min at idx %d (using std::min_element)\n", idx); + auto end = std::chrono::steady_clock::now(); + printf("-> elapsed=%d\n", static_cast((end - start).count())); + } + { + auto start = std::chrono::steady_clock::now(); + int idx = 0; + for (int i = 0; i < N_ITERS; ++i) + { + idx += static_cast( + gdal::min_element(x.data(), x.size(), eDT, true, 0)); + } + idx /= N_ITERS; + printf("min at idx %d(nodata case)\n", idx); + 
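+            // Note: (end - start).count() below reports raw
+            // std::chrono::steady_clock ticks, whose unit is
+            // implementation-defined, so the elapsed figures are only
+            // meaningful relative to each other.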
auto end = std::chrono::steady_clock::now(); + printf("-> elapsed=%d\n", static_cast((end - start).count())); + } + } + printf("--------------------\n"); + { + using T = uint32_t; + constexpr GDALDataType eDT = GDT_UInt32; + printf("uint32:\n"); + std::vector x; + x.resize(SIZE); + randomFill(x.data(), x.size()); + { + auto start = std::chrono::steady_clock::now(); + int idx = 0; + for (int i = 0; i < N_ITERS; ++i) + { + idx += static_cast( + gdal::min_element(x.data(), x.size(), eDT, false, 0)); + } + idx /= N_ITERS; + printf("min at idx %d\n", idx); + auto end = std::chrono::steady_clock::now(); + printf("-> elapsed=%d\n", static_cast((end - start).count())); + } + { + auto start = std::chrono::steady_clock::now(); + int idx = 0; + for (int i = 0; i < N_ITERS; ++i) + { + idx += static_cast(std::distance( + x.begin(), std::min_element(x.begin(), x.end()))); + } + idx /= N_ITERS; + printf("min at idx %d (using std::min_element)\n", idx); + auto end = std::chrono::steady_clock::now(); + printf("-> elapsed=%d\n", static_cast((end - start).count())); + } + { + auto start = std::chrono::steady_clock::now(); + int idx = 0; + for (int i = 0; i < N_ITERS; ++i) + { + idx += static_cast( + gdal::min_element(x.data(), x.size(), eDT, true, 0)); + } + idx /= N_ITERS; + printf("min at idx %d(nodata case)\n", idx); + auto end = std::chrono::steady_clock::now(); + printf("-> elapsed=%d\n", static_cast((end - start).count())); + } + } + printf("--------------------\n"); + { + using T = int32_t; + constexpr GDALDataType eDT = GDT_Int32; + printf("int32:\n"); + std::vector x; + x.resize(SIZE); + randomFill(x.data(), x.size()); + { + auto start = std::chrono::steady_clock::now(); + int idx = 0; + for (int i = 0; i < N_ITERS; ++i) + { + idx += static_cast( + gdal::min_element(x.data(), x.size(), eDT, false, 0)); + } + idx /= N_ITERS; + printf("min at idx %d\n", idx); + auto end = std::chrono::steady_clock::now(); + printf("-> elapsed=%d\n", static_cast((end - start).count())); + } + { + auto start = std::chrono::steady_clock::now(); + int idx = 0; + for (int i = 0; i < N_ITERS; ++i) + { + idx += static_cast(std::distance( + x.begin(), std::min_element(x.begin(), x.end()))); + } + idx /= N_ITERS; + printf("min at idx %d (using std::min_element)\n", idx); + auto end = std::chrono::steady_clock::now(); + printf("-> elapsed=%d\n", static_cast((end - start).count())); + } + { + auto start = std::chrono::steady_clock::now(); + int idx = 0; + for (int i = 0; i < N_ITERS; ++i) + { + idx += static_cast( + gdal::min_element(x.data(), x.size(), eDT, true, 0)); + } + idx /= N_ITERS; + printf("min at idx %d(nodata case)\n", idx); + auto end = std::chrono::steady_clock::now(); + printf("-> elapsed=%d\n", static_cast((end - start).count())); + } + } + printf("--------------------\n"); + { + using T = float; + constexpr GDALDataType eDT = GDT_Float32; + printf("float:\n"); + std::vector x; + x.resize(SIZE); + randomFill(x.data(), x.size()); + { + auto start = std::chrono::steady_clock::now(); + int idx = 0; + for (int i = 0; i < N_ITERS; ++i) + { + idx += static_cast( + gdal::min_element(x.data(), x.size(), eDT, false, 0)); + } + idx /= N_ITERS; + printf("min at idx %d\n", idx); + auto end = std::chrono::steady_clock::now(); + printf("-> elapsed=%d\n", static_cast((end - start).count())); + } + { + auto start = std::chrono::steady_clock::now(); + int idx = 0; + for (int i = 0; i < N_ITERS; ++i) + { + idx += static_cast(std::distance( + x.begin(), std::min_element(x.begin(), x.end(), + [](float x, float y) { + return 
std::isnan(y) ? true + : std::isnan(x) + ? false + : x < y; + }))); + } + idx /= N_ITERS; + printf("min at idx %d (using std::min_element with NaN aware " + "comparison)\n", + idx); + auto end = std::chrono::steady_clock::now(); + printf("-> elapsed=%d\n", static_cast((end - start).count())); + } + { + auto start = std::chrono::steady_clock::now(); + int idx = 0; + for (int i = 0; i < N_ITERS; ++i) + { + idx += static_cast( + gdal::min_element(x.data(), x.size(), eDT, true, 0)); + } + idx /= N_ITERS; + printf("min at idx %d(nodata case)\n", idx); + auto end = std::chrono::steady_clock::now(); + printf("-> elapsed=%d\n", static_cast((end - start).count())); + } + } + printf("--------------------\n"); + { + using T = float; + constexpr GDALDataType eDT = GDT_Float32; + printf("float (without NaN):\n"); + std::vector x; + x.resize(SIZE); + randomFill(x.data(), x.size(), false); + { + auto start = std::chrono::steady_clock::now(); + int idx = 0; + for (int i = 0; i < N_ITERS; ++i) + { + idx += static_cast( + gdal::min_element(x.data(), x.size(), eDT, false, 0)); + } + idx /= N_ITERS; + printf("min at idx %d\n", idx); + auto end = std::chrono::steady_clock::now(); + printf("-> elapsed=%d\n", static_cast((end - start).count())); + } + { + auto start = std::chrono::steady_clock::now(); + int idx = 0; + for (int i = 0; i < N_ITERS; ++i) + { + idx += static_cast(std::distance( + x.begin(), std::min_element(x.begin(), x.end()))); + } + idx /= N_ITERS; + printf("min at idx %d (using std::min_element)\n", idx); + auto end = std::chrono::steady_clock::now(); + printf("-> elapsed=%d\n", static_cast((end - start).count())); + } + { + auto start = std::chrono::steady_clock::now(); + int idx = 0; + for (int i = 0; i < N_ITERS; ++i) + { + idx += static_cast( + gdal::min_element(x.data(), x.size(), eDT, true, 0)); + } + idx /= N_ITERS; + printf("min at idx %d(nodata case)\n", idx); + auto end = std::chrono::steady_clock::now(); + printf("-> elapsed=%d\n", static_cast((end - start).count())); + } + } + printf("--------------------\n"); + { + using T = double; + constexpr GDALDataType eDT = GDT_Float64; + printf("double:\n"); + std::vector x; + x.resize(SIZE); + randomFill(x.data(), x.size()); + { + auto start = std::chrono::steady_clock::now(); + int idx = 0; + for (int i = 0; i < N_ITERS; ++i) + { + idx += static_cast( + gdal::min_element(x.data(), x.size(), eDT, false, 0)); + } + idx /= N_ITERS; + printf("min at idx %d\n", idx); + auto end = std::chrono::steady_clock::now(); + printf("-> elapsed=%d\n", static_cast((end - start).count())); + } + { + auto start = std::chrono::steady_clock::now(); + int idx = 0; + for (int i = 0; i < N_ITERS; ++i) + { + idx += static_cast(std::distance( + x.begin(), std::min_element(x.begin(), x.end(), + [](double x, double y) { + return std::isnan(y) ? true + : std::isnan(x) + ? 
false + : x < y; + }))); + } + idx /= N_ITERS; + printf("min at idx %d (using std::min_element with NaN aware " + "comparison)\n", + idx); + auto end = std::chrono::steady_clock::now(); + printf("-> elapsed=%d\n", static_cast((end - start).count())); + } + { + auto start = std::chrono::steady_clock::now(); + int idx = 0; + for (int i = 0; i < N_ITERS; ++i) + { + idx += static_cast( + gdal::min_element(x.data(), x.size(), eDT, true, 0)); + } + idx /= N_ITERS; + printf("min at idx %d(nodata case)\n", idx); + auto end = std::chrono::steady_clock::now(); + printf("-> elapsed=%d\n", static_cast((end - start).count())); + } + } + printf("--------------------\n"); + { + using T = double; + constexpr GDALDataType eDT = GDT_Float64; + printf("double (without NaN):\n"); + std::vector x; + x.resize(SIZE); + randomFill(x.data(), x.size(), false); + { + auto start = std::chrono::steady_clock::now(); + int idx = 0; + for (int i = 0; i < N_ITERS; ++i) + { + idx += static_cast( + gdal::min_element(x.data(), x.size(), eDT, false, 0)); + } + idx /= N_ITERS; + printf("min at idx %d\n", idx); + auto end = std::chrono::steady_clock::now(); + printf("-> elapsed=%d\n", static_cast((end - start).count())); + } + { + auto start = std::chrono::steady_clock::now(); + int idx = 0; + for (int i = 0; i < N_ITERS; ++i) + { + idx += static_cast(std::distance( + x.begin(), std::min_element(x.begin(), x.end()))); + } + idx /= N_ITERS; + printf("min at idx %d (using std::min_element)\n", idx); + auto end = std::chrono::steady_clock::now(); + printf("-> elapsed=%d\n", static_cast((end - start).count())); + } + { + auto start = std::chrono::steady_clock::now(); + int idx = 0; + for (int i = 0; i < N_ITERS; ++i) + { + idx += static_cast( + gdal::min_element(x.data(), x.size(), eDT, true, 0)); + } + idx /= N_ITERS; + printf("min at idx %d(nodata case)\n", idx); + auto end = std::chrono::steady_clock::now(); + printf("-> elapsed=%d\n", static_cast((end - start).count())); + } + } + return 0; +} From d84849644feddb8440e59d0cf90d559934e9d799 Mon Sep 17 00:00:00 2001 From: Even Rouault Date: Sun, 3 Nov 2024 16:33:26 +0100 Subject: [PATCH 12/62] gdal_minmax_element.hpp: add SSE2 optimization for uint8, int8, uint16, int16, uint32, int32, float and double nodata cases --- autotest/cpp/test_gdal_minmax_element.cpp | 102 ++++++ gcore/gdal_minmax_element.hpp | 386 ++++++++++++++++++--- perftests/testperf_gdal_minmax_element.cpp | 252 ++++++++++++-- 3 files changed, 678 insertions(+), 62 deletions(-) diff --git a/autotest/cpp/test_gdal_minmax_element.cpp b/autotest/cpp/test_gdal_minmax_element.cpp index 0d3216ca9b50..f616fc032e45 100644 --- a/autotest/cpp/test_gdal_minmax_element.cpp +++ b/autotest/cpp/test_gdal_minmax_element.cpp @@ -91,6 +91,26 @@ TEST_F(test_gdal_minmax_element, uint8) auto idx_max = gdal::max_element(v.data(), v.size(), eDT, false, 0); EXPECT_EQ(v[idx_max], max_v); } + { + std::vector v(257, static_cast(min_v + 2)); + v[128] = static_cast(min_v + 1); + v[256] = min_v; + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, true, + static_cast(min_v + 1)); + EXPECT_EQ(v[idx_min], min_v); + } + { + std::vector v(257, 0); + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, true, 0); + EXPECT_EQ(idx_min, 0); + } + { + std::vector v(257, 0); + v[127] = static_cast(min_v + 1); + v[255] = min_v; + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, true, 0); + EXPECT_EQ(v[idx_min], min_v); + } { std::vector v(259, static_cast((min_v + max_v) / 2)); v[0] = min_v; @@ -156,6 +176,14 @@ 
TEST_F(test_gdal_minmax_element, uint8) auto idx_max = gdal::max_element(v.data(), v.size(), eDT, false, 0); EXPECT_EQ(v[idx_max], max_v); } + { + std::vector v(257, 0); + v[65] = static_cast(max_v - 2); + v[66] = static_cast(max_v - 1); + v[129] = max_v; + auto idx_max = gdal::max_element(v.data(), v.size(), eDT, true, 0); + EXPECT_EQ(v[idx_max], max_v); + } } TEST_F(test_gdal_minmax_element, int8) @@ -214,6 +242,14 @@ TEST_F(test_gdal_minmax_element, int8) auto idx_max = gdal::max_element(v.data(), v.size(), eDT, false, 0); EXPECT_EQ(v[idx_max], max_v); } + { + std::vector v(257, static_cast(min_v + 2)); + v[128] = static_cast(min_v + 1); + v[256] = min_v; + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, true, + static_cast(min_v + 1)); + EXPECT_EQ(v[idx_min], min_v); + } } TEST_F(test_gdal_minmax_element, uint16) @@ -272,6 +308,14 @@ TEST_F(test_gdal_minmax_element, uint16) auto idx_max = gdal::max_element(v.data(), v.size(), eDT, false, 0); EXPECT_EQ(v[idx_max], max_v); } + { + std::vector v(257, static_cast(min_v + 2)); + v[128] = static_cast(min_v + 1); + v[256] = min_v; + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, true, + static_cast(min_v + 1)); + EXPECT_EQ(v[idx_min], min_v); + } } TEST_F(test_gdal_minmax_element, int16) @@ -330,6 +374,14 @@ TEST_F(test_gdal_minmax_element, int16) auto idx_max = gdal::max_element(v.data(), v.size(), eDT, false, 0); EXPECT_EQ(v[idx_max], max_v); } + { + std::vector v(257, static_cast(min_v + 2)); + v[128] = static_cast(min_v + 1); + v[256] = min_v; + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, true, + static_cast(min_v + 1)); + EXPECT_EQ(v[idx_min], min_v); + } } TEST_F(test_gdal_minmax_element, uint32) @@ -388,6 +440,14 @@ TEST_F(test_gdal_minmax_element, uint32) auto idx_max = gdal::max_element(v.data(), v.size(), eDT, false, 0); EXPECT_EQ(v[idx_max], max_v); } + { + std::vector v(257, static_cast(min_v + 2)); + v[128] = static_cast(min_v + 1); + v[256] = min_v; + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, true, + static_cast(min_v + 1)); + EXPECT_EQ(v[idx_min], min_v); + } } TEST_F(test_gdal_minmax_element, int32) @@ -446,6 +506,14 @@ TEST_F(test_gdal_minmax_element, int32) auto idx_max = gdal::max_element(v.data(), v.size(), eDT, false, 0); EXPECT_EQ(v[idx_max], max_v); } + { + std::vector v(257, static_cast(min_v + 2)); + v[128] = static_cast(min_v + 1); + v[256] = min_v; + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, true, + static_cast(min_v + 1)); + EXPECT_EQ(v[idx_min], min_v); + } } TEST_F(test_gdal_minmax_element, uint64) @@ -504,6 +572,15 @@ TEST_F(test_gdal_minmax_element, uint64) auto idx_max = gdal::max_element(v.data(), v.size(), eDT, false, 0); EXPECT_EQ(v[idx_max], max_v); } + { + std::vector v(257, static_cast(min_v + 2)); + v[128] = static_cast(min_v + 1); + v[256] = min_v; + auto idx_min = + gdal::min_element(v.data(), v.size(), eDT, true, + static_cast(static_cast(min_v + 1))); + EXPECT_EQ(v[idx_min], min_v); + } } TEST_F(test_gdal_minmax_element, int64) @@ -568,6 +645,15 @@ TEST_F(test_gdal_minmax_element, int64) auto idx_max = gdal::max_element(v.data(), v.size(), eDT, false, 0); EXPECT_EQ(v[idx_max], max_v); } + { + std::vector v(257, static_cast(min_v + 2)); + v[128] = static_cast(min_v + 1); + v[256] = min_v; + auto idx_min = + gdal::min_element(v.data(), v.size(), eDT, true, + static_cast(static_cast(min_v + 1))); + EXPECT_EQ(v[idx_min], min_v); + } } TEST_F(test_gdal_minmax_element, float32) @@ -680,6 +766,14 @@ TEST_F(test_gdal_minmax_element, float32) 
auto idx_max = gdal::max_element(v.data(), v.size(), eDT, false, 0); EXPECT_EQ(v[idx_max], max_v); } + { + std::vector v(257, static_cast(min_v + 0.2f)); + v[128] = static_cast(min_v + 0.1f); + v[256] = min_v; + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, true, + static_cast(min_v + 0.1f)); + EXPECT_EQ(v[idx_min], min_v); + } } TEST_F(test_gdal_minmax_element, float64) @@ -769,6 +863,14 @@ TEST_F(test_gdal_minmax_element, float64) auto idx_max = gdal::max_element(v.data(), v.size(), eDT, false, 0); EXPECT_EQ(v[idx_max], max_v); } + { + std::vector v(257, static_cast(min_v + 0.2)); + v[128] = static_cast(min_v + 0.1); + v[256] = min_v; + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, true, + static_cast(min_v + 0.1)); + EXPECT_EQ(v[idx_min], min_v); + } } TEST_F(test_gdal_minmax_element, unsupported) diff --git a/gcore/gdal_minmax_element.hpp b/gcore/gdal_minmax_element.hpp index cf4e6a93aafc..62b3dee7e9da 100644 --- a/gcore/gdal_minmax_element.hpp +++ b/gcore/gdal_minmax_element.hpp @@ -34,6 +34,10 @@ #endif #if defined(__x86_64) || defined(_M_X64) +#define GDAL_MINMAX_ELEMENT_USE_SSE2 +#endif + +#ifdef GDAL_MINMAX_ELEMENT_USE_SSE2 // SSE2 header #include #endif @@ -130,10 +134,10 @@ template size_t extremum_element(const T *v, size_t size) return idx_of_extremum; } -#if defined(__x86_64) || defined(_M_X64) +#ifdef GDAL_MINMAX_ELEMENT_USE_SSE2 /************************************************************************/ -/* extremum_element_with_nan() */ +/* extremum_element_with_nan() */ /************************************************************************/ static inline int8_t Shift8(uint8_t x) @@ -176,6 +180,39 @@ template static inline auto set1(T x) return _mm_set1_pd(x); } +// Return a _mm128[i|d] register with all its elements set to x +template static inline auto set1_unshifted(T x) +{ + if constexpr (std::is_same_v) + { + int8_t xSigned; + memcpy(&xSigned, &x, sizeof(xSigned)); + return _mm_set1_epi8(xSigned); + } + else if constexpr (std::is_same_v) + return _mm_set1_epi8(x); + else if constexpr (std::is_same_v) + { + int16_t xSigned; + memcpy(&xSigned, &x, sizeof(xSigned)); + return _mm_set1_epi16(xSigned); + } + else if constexpr (std::is_same_v) + return _mm_set1_epi16(x); + else if constexpr (std::is_same_v) + { + int32_t xSigned; + memcpy(&xSigned, &x, sizeof(xSigned)); + return _mm_set1_epi32(xSigned); + } + else if constexpr (std::is_same_v) + return _mm_set1_epi32(x); + else if constexpr (std::is_same_v) + return _mm_set1_ps(x); + else + return _mm_set1_pd(x); +} + // Load as many values of type T at a _mm128[i|d] register can contain from x template static inline auto loadv(const T *x) { @@ -253,8 +290,8 @@ static inline __m128i comp(SSE_T x, SSE_T y) } // Using SSE2 -template -inline size_t extremum_element_with_nan(const T *v, size_t size) +template +inline size_t extremum_element_with_nan(const T *v, size_t size, T noDataValue) { static_assert(std::is_same_v || std::is_same_v || std::is_same_v || std::is_same_v || @@ -264,7 +301,16 @@ inline size_t extremum_element_with_nan(const T *v, size_t size) return 0; size_t idx_of_extremum = 0; T extremum = v[0]; - [[maybe_unused]] bool extremum_is_nan = std::isnan(extremum); + [[maybe_unused]] bool extremum_is_invalid = false; + if constexpr (std::is_floating_point_v) + { + extremum_is_invalid = std::isnan(extremum); + } + if constexpr (HAS_NODATA) + { + if (extremum == noDataValue) + extremum_is_invalid = true; + } size_t i = 1; constexpr size_t VALS_PER_REG = sizeof(set1(extremum)) / 
sizeof(extremum); @@ -274,22 +320,43 @@ inline size_t extremum_element_with_nan(const T *v, size_t size) static_assert(LOOP_UNROLLING == 4); constexpr size_t VALS_PER_ITER = VALS_PER_REG * LOOP_UNROLLING; - const auto update = - [v, &extremum, &idx_of_extremum, &extremum_is_nan](size_t idx) + const auto update = [v, noDataValue, &extremum, &idx_of_extremum, + &extremum_is_invalid](size_t idx) { + if constexpr (HAS_NODATA) + { + if (v[idx] == noDataValue) + return; + if (extremum_is_invalid) + { + if constexpr (std::is_floating_point_v) + { + if (std::isnan(v[idx])) + return; + } + extremum = v[idx]; + idx_of_extremum = idx; + extremum_is_invalid = false; + return; + } + } + else + { + CPL_IGNORE_RET_VAL(noDataValue); + } if (compScalar(v[idx], extremum)) { extremum = v[idx]; idx_of_extremum = idx; - extremum_is_nan = false; + extremum_is_invalid = false; } else if constexpr (std::is_floating_point_v) { - if (extremum_is_nan && !std::isnan(v[idx])) + if (extremum_is_invalid && !std::isnan(v[idx])) { extremum = v[idx]; idx_of_extremum = idx; - extremum_is_nan = false; + extremum_is_invalid = false; } } }; @@ -299,6 +366,24 @@ inline size_t extremum_element_with_nan(const T *v, size_t size) update(i); } + [[maybe_unused]] auto sse_neutral = set1_unshifted(static_cast(0)); + [[maybe_unused]] auto sse_nodata = set1_unshifted(noDataValue); + if constexpr (HAS_NODATA) + { + for (; i < size && extremum_is_invalid; ++i) + { + update(i); + } + if (!extremum_is_invalid) + { + for (; i < size && (i % VALS_PER_ITER) != 0; ++i) + { + update(i); + } + sse_neutral = set1_unshifted(extremum); + } + } + auto sse_extremum = set1(extremum); [[maybe_unused]] size_t hits = 0; @@ -306,10 +391,103 @@ inline size_t extremum_element_with_nan(const T *v, size_t size) for (; i < sse_iter_count; i += VALS_PER_ITER) { // A bit of loop unrolling to save 3/4 of slow movemask operations. 
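+        // _mm_movemask_epi8 moves a comparison mask from a SIMD register to
+        // a general purpose register, which is comparatively expensive;
+        // OR-ing the four comparison results first means a single movemask
+        // per unrolled iteration instead of one per loaded register.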
- const auto sse_val0 = loadv(v + i + 0 * VALS_PER_REG); - const auto sse_val1 = loadv(v + i + 1 * VALS_PER_REG); - const auto sse_val2 = loadv(v + i + 2 * VALS_PER_REG); - const auto sse_val3 = loadv(v + i + 3 * VALS_PER_REG); + auto sse_val0 = loadv(v + i + 0 * VALS_PER_REG); + auto sse_val1 = loadv(v + i + 1 * VALS_PER_REG); + auto sse_val2 = loadv(v + i + 2 * VALS_PER_REG); + auto sse_val3 = loadv(v + i + 3 * VALS_PER_REG); + + if constexpr (HAS_NODATA) + { + // Replace all components that are at the nodata value by a + // neutral value (current minimum) + if constexpr (std::is_same_v || + std::is_same_v) + { + const auto replaceNoDataByNeutral = + [sse_neutral, sse_nodata](auto sse_val) + { + const auto eq_nodata = _mm_cmpeq_epi8(sse_val, sse_nodata); + return _mm_or_si128(_mm_and_si128(eq_nodata, sse_neutral), + _mm_andnot_si128(eq_nodata, sse_val)); + }; + + sse_val0 = replaceNoDataByNeutral(sse_val0); + sse_val1 = replaceNoDataByNeutral(sse_val1); + sse_val2 = replaceNoDataByNeutral(sse_val2); + sse_val3 = replaceNoDataByNeutral(sse_val3); + } + else if constexpr (std::is_same_v || + std::is_same_v) + { + const auto replaceNoDataByNeutral = + [sse_neutral, sse_nodata](auto sse_val) + { + const auto eq_nodata = _mm_cmpeq_epi16(sse_val, sse_nodata); + return _mm_or_si128(_mm_and_si128(eq_nodata, sse_neutral), + _mm_andnot_si128(eq_nodata, sse_val)); + }; + + sse_val0 = replaceNoDataByNeutral(sse_val0); + sse_val1 = replaceNoDataByNeutral(sse_val1); + sse_val2 = replaceNoDataByNeutral(sse_val2); + sse_val3 = replaceNoDataByNeutral(sse_val3); + } + else if constexpr (std::is_same_v || + std::is_same_v) + { + const auto replaceNoDataByNeutral = + [sse_neutral, sse_nodata](auto sse_val) + { + const auto eq_nodata = _mm_cmpeq_epi32(sse_val, sse_nodata); + return _mm_or_si128(_mm_and_si128(eq_nodata, sse_neutral), + _mm_andnot_si128(eq_nodata, sse_val)); + }; + + sse_val0 = replaceNoDataByNeutral(sse_val0); + sse_val1 = replaceNoDataByNeutral(sse_val1); + sse_val2 = replaceNoDataByNeutral(sse_val2); + sse_val3 = replaceNoDataByNeutral(sse_val3); + } + else if constexpr (std::is_same_v) + { + const auto replaceNoDataByNeutral = + [sse_neutral, sse_nodata](auto sse_val) + { + const auto eq_nodata = _mm_cmpeq_ps(sse_val, sse_nodata); + return _mm_or_ps(_mm_and_ps(eq_nodata, sse_neutral), + _mm_andnot_ps(eq_nodata, sse_val)); + }; + + sse_val0 = replaceNoDataByNeutral(sse_val0); + sse_val1 = replaceNoDataByNeutral(sse_val1); + sse_val2 = replaceNoDataByNeutral(sse_val2); + sse_val3 = replaceNoDataByNeutral(sse_val3); + } + else if constexpr (std::is_same_v) + { + const auto replaceNoDataByNeutral = + [sse_neutral, sse_nodata](auto sse_val) + { + const auto eq_nodata = _mm_cmpeq_pd(sse_val, sse_nodata); + return _mm_or_pd(_mm_and_pd(eq_nodata, sse_neutral), + _mm_andnot_pd(eq_nodata, sse_val)); + }; + + sse_val0 = replaceNoDataByNeutral(sse_val0); + sse_val1 = replaceNoDataByNeutral(sse_val1); + sse_val2 = replaceNoDataByNeutral(sse_val2); + sse_val3 = replaceNoDataByNeutral(sse_val3); + } + else + { + static_assert( + std::is_same_v || std::is_same_v || + std::is_same_v || std::is_same_v || + std::is_same_v || std::is_same_v || + std::is_same_v || std::is_same_v); + } + } + if (_mm_movemask_epi8(_mm_or_si128( _mm_or_si128(comp(sse_val0, sse_extremum), comp(sse_val1, sse_extremum)), @@ -337,6 +515,10 @@ inline size_t extremum_element_with_nan(const T *v, size_t size) update(i + j); } sse_extremum = set1(extremum); + if constexpr (HAS_NODATA) + { + sse_neutral = set1_unshifted(extremum); + } } } 
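+    // Scalar processing of the trailing elements that do not fill a
+    // complete unrolled SSE iteration.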
for (; i < size; ++i) @@ -352,9 +534,10 @@ inline size_t extremum_element_with_nan(const T *v, size_t size) /* extremum_element_with_nan() */ /************************************************************************/ -template -inline size_t extremum_element_with_nan(const T *v, size_t size) +template +inline size_t extremum_element_with_nan(const T *v, size_t size, T /* nodata */) { + static_assert(!HAS_NODATA); if (size == 0) return 0; size_t idx_of_extremum = 0; @@ -379,98 +562,217 @@ inline size_t extremum_element_with_nan(const T *v, size_t size) /* extremum_element() */ /************************************************************************/ -#if defined(__x86_64) || defined(_M_X64) +#ifdef GDAL_MINMAX_ELEMENT_USE_SSE2 + +template <> +size_t extremum_element(const uint8_t *v, size_t size, + uint8_t noDataValue) +{ + return extremum_element_with_nan(v, size, noDataValue); +} + +template <> +size_t extremum_element(const uint8_t *v, size_t size, + uint8_t noDataValue) +{ + return extremum_element_with_nan(v, size, + noDataValue); +} template <> size_t extremum_element(const uint8_t *v, size_t size) { - return extremum_element_with_nan(v, size); + return extremum_element_with_nan(v, size, 0); } template <> size_t extremum_element(const uint8_t *v, size_t size) { - return extremum_element_with_nan(v, size); + return extremum_element_with_nan(v, size, 0); +} + +template <> +size_t extremum_element(const int8_t *v, size_t size, + int8_t noDataValue) +{ + return extremum_element_with_nan(v, size, noDataValue); +} + +template <> +size_t extremum_element(const int8_t *v, size_t size, + int8_t noDataValue) +{ + return extremum_element_with_nan(v, size, noDataValue); } template <> size_t extremum_element(const int8_t *v, size_t size) { - return extremum_element_with_nan(v, size); + return extremum_element_with_nan(v, size, 0); } template <> size_t extremum_element(const int8_t *v, size_t size) { - return extremum_element_with_nan(v, size); + return extremum_element_with_nan(v, size, 0); +} + +template <> +size_t extremum_element(const uint16_t *v, size_t size, + uint16_t noDataValue) +{ + return extremum_element_with_nan(v, size, + noDataValue); +} + +template <> +size_t extremum_element(const uint16_t *v, size_t size, + uint16_t noDataValue) +{ + return extremum_element_with_nan(v, size, + noDataValue); } template <> size_t extremum_element(const uint16_t *v, size_t size) { - return extremum_element_with_nan(v, size); + return extremum_element_with_nan(v, size, 0); } template <> size_t extremum_element(const uint16_t *v, size_t size) { - return extremum_element_with_nan(v, size); + return extremum_element_with_nan(v, size, 0); +} + +template <> +size_t extremum_element(const int16_t *v, size_t size, + int16_t noDataValue) +{ + return extremum_element_with_nan(v, size, noDataValue); +} + +template <> +size_t extremum_element(const int16_t *v, size_t size, + int16_t noDataValue) +{ + return extremum_element_with_nan(v, size, + noDataValue); } template <> size_t extremum_element(const int16_t *v, size_t size) { - return extremum_element_with_nan(v, size); + return extremum_element_with_nan(v, size, 0); } template <> size_t extremum_element(const int16_t *v, size_t size) { - return extremum_element_with_nan(v, size); + return extremum_element_with_nan(v, size, 0); +} + +template <> +size_t extremum_element(const uint32_t *v, size_t size, + uint32_t noDataValue) +{ + return extremum_element_with_nan(v, size, + noDataValue); +} + +template <> +size_t extremum_element(const uint32_t *v, size_t 
size, + uint32_t noDataValue) +{ + return extremum_element_with_nan(v, size, + noDataValue); } template <> size_t extremum_element(const uint32_t *v, size_t size) { - return extremum_element_with_nan(v, size); + return extremum_element_with_nan(v, size, 0); } template <> size_t extremum_element(const uint32_t *v, size_t size) { - return extremum_element_with_nan(v, size); + return extremum_element_with_nan(v, size, 0); +} + +template <> +size_t extremum_element(const int32_t *v, size_t size, + int32_t noDataValue) +{ + return extremum_element_with_nan(v, size, noDataValue); +} + +template <> +size_t extremum_element(const int32_t *v, size_t size, + int32_t noDataValue) +{ + return extremum_element_with_nan(v, size, + noDataValue); } template <> size_t extremum_element(const int32_t *v, size_t size) { - return extremum_element_with_nan(v, size); + return extremum_element_with_nan(v, size, 0); } template <> size_t extremum_element(const int32_t *v, size_t size) { - return extremum_element_with_nan(v, size); + return extremum_element_with_nan(v, size, 0); +} + +template <> +size_t extremum_element(const float *v, size_t size, + float noDataValue) +{ + return extremum_element_with_nan(v, size, noDataValue); +} + +template <> +size_t extremum_element(const float *v, size_t size, + float noDataValue) +{ + return extremum_element_with_nan(v, size, noDataValue); +} + +template <> +size_t extremum_element(const double *v, size_t size, + double noDataValue) +{ + return extremum_element_with_nan(v, size, noDataValue); +} + +template <> +size_t extremum_element(const double *v, size_t size, + double noDataValue) +{ + return extremum_element_with_nan(v, size, noDataValue); } #endif template <> size_t extremum_element(const float *v, size_t size) { - return extremum_element_with_nan(v, size); + return extremum_element_with_nan(v, size, 0); } template <> size_t extremum_element(const double *v, size_t size) { - return extremum_element_with_nan(v, size); + return extremum_element_with_nan(v, size, 0); } template <> size_t extremum_element(const float *v, size_t size) { - return extremum_element_with_nan(v, size); + return extremum_element_with_nan(v, size, 0); } template <> size_t extremum_element(const double *v, size_t size) { - return extremum_element_with_nan(v, size); + return extremum_element_with_nan(v, size, 0); } /************************************************************************/ @@ -481,7 +783,7 @@ template inline size_t extremum_element_with_nan(const T *v, size_t size, T noDataValue) { if (std::isnan(noDataValue)) - return extremum_element_with_nan(v, size); + return extremum_element_with_nan(v, size, 0); if (size == 0) return 0; size_t idx_of_extremum = 0; @@ -507,6 +809,8 @@ inline size_t extremum_element_with_nan(const T *v, size_t size, T noDataValue) /* extremum_element() */ /************************************************************************/ +#if !defined(GDAL_MINMAX_ELEMENT_USE_SSE2) + template <> size_t extremum_element(const float *v, size_t size, float noDataValue) @@ -515,17 +819,17 @@ size_t extremum_element(const float *v, size_t size, } template <> -size_t extremum_element(const double *v, size_t size, - double noDataValue) +size_t extremum_element(const float *v, size_t size, + float noDataValue) { - return extremum_element_with_nan(v, size, noDataValue); + return extremum_element_with_nan(v, size, noDataValue); } template <> -size_t extremum_element(const float *v, size_t size, - float noDataValue) +size_t extremum_element(const double *v, size_t size, + double 
noDataValue) { - return extremum_element_with_nan(v, size, noDataValue); + return extremum_element_with_nan(v, size, noDataValue); } template <> @@ -535,6 +839,8 @@ size_t extremum_element(const double *v, size_t size, return extremum_element_with_nan(v, size, noDataValue); } +#endif + template inline size_t extremum_element(const T *buffer, size_t size, bool bHasNoData, T noDataValue) diff --git a/perftests/testperf_gdal_minmax_element.cpp b/perftests/testperf_gdal_minmax_element.cpp index 77968e11f850..c89ee9d50149 100644 --- a/perftests/testperf_gdal_minmax_element.cpp +++ b/perftests/testperf_gdal_minmax_element.cpp @@ -51,7 +51,7 @@ int main(int /* argc */, char * /* argv */[]) gdal::min_element(x.data(), x.size(), eDT, false, 0)); } idx /= N_ITERS; - printf("min at idx %d\n", idx); + printf("min at idx %d (optimized)\n", idx); auto end = std::chrono::steady_clock::now(); printf("-> elapsed=%d\n", static_cast((end - start).count())); } @@ -77,7 +77,27 @@ int main(int /* argc */, char * /* argv */[]) gdal::min_element(x.data(), x.size(), eDT, true, 0)); } idx /= N_ITERS; - printf("min at idx %d(nodata case)\n", idx); + printf("min at idx %d (nodata case, optimized)\n", idx); + auto end = std::chrono::steady_clock::now(); + printf("-> elapsed=%d\n", static_cast((end - start).count())); + } + { + auto start = std::chrono::steady_clock::now(); + int idx = 0; + for (int i = 0; i < N_ITERS; ++i) + { + idx += static_cast(std::distance( + x.begin(), std::min_element(x.begin(), x.end(), + [](T x, T y) { + return y == 0 ? true + : x == 0 ? false + : x < y; + }))); + } + idx /= N_ITERS; + printf("min at idx %d (nodata case, using std::min_element with " + "nodata aware comparison)\n", + idx); auto end = std::chrono::steady_clock::now(); printf("-> elapsed=%d\n", static_cast((end - start).count())); } @@ -99,7 +119,7 @@ int main(int /* argc */, char * /* argv */[]) gdal::min_element(x.data(), x.size(), eDT, false, 0)); } idx /= N_ITERS; - printf("min at idx %d\n", idx); + printf("min at idx %d (optimized)\n", idx); auto end = std::chrono::steady_clock::now(); printf("-> elapsed=%d\n", static_cast((end - start).count())); } @@ -125,7 +145,27 @@ int main(int /* argc */, char * /* argv */[]) gdal::min_element(x.data(), x.size(), eDT, true, 0)); } idx /= N_ITERS; - printf("min at idx %d(nodata case)\n", idx); + printf("min at idx %d (nodata case, optimized)\n", idx); + auto end = std::chrono::steady_clock::now(); + printf("-> elapsed=%d\n", static_cast((end - start).count())); + } + { + auto start = std::chrono::steady_clock::now(); + int idx = 0; + for (int i = 0; i < N_ITERS; ++i) + { + idx += static_cast(std::distance( + x.begin(), std::min_element(x.begin(), x.end(), + [](T x, T y) { + return y == 0 ? true + : x == 0 ? 
false + : x < y; + }))); + } + idx /= N_ITERS; + printf("min at idx %d (nodata case, using std::min_element with " + "nodata aware comparison)\n", + idx); auto end = std::chrono::steady_clock::now(); printf("-> elapsed=%d\n", static_cast((end - start).count())); } @@ -147,7 +187,7 @@ int main(int /* argc */, char * /* argv */[]) gdal::min_element(x.data(), x.size(), eDT, false, 0)); } idx /= N_ITERS; - printf("min at idx %d\n", idx); + printf("min at idx %d (optimized)\n", idx); auto end = std::chrono::steady_clock::now(); printf("-> elapsed=%d\n", static_cast((end - start).count())); } @@ -173,7 +213,27 @@ int main(int /* argc */, char * /* argv */[]) gdal::min_element(x.data(), x.size(), eDT, true, 0)); } idx /= N_ITERS; - printf("min at idx %d(nodata case)\n", idx); + printf("min at idx %d (nodata case, optimized)\n", idx); + auto end = std::chrono::steady_clock::now(); + printf("-> elapsed=%d\n", static_cast((end - start).count())); + } + { + auto start = std::chrono::steady_clock::now(); + int idx = 0; + for (int i = 0; i < N_ITERS; ++i) + { + idx += static_cast(std::distance( + x.begin(), std::min_element(x.begin(), x.end(), + [](T x, T y) { + return y == 0 ? true + : x == 0 ? false + : x < y; + }))); + } + idx /= N_ITERS; + printf("min at idx %d (nodata case, using std::min_element with " + "nodata aware comparison)\n", + idx); auto end = std::chrono::steady_clock::now(); printf("-> elapsed=%d\n", static_cast((end - start).count())); } @@ -195,7 +255,7 @@ int main(int /* argc */, char * /* argv */[]) gdal::min_element(x.data(), x.size(), eDT, false, 0)); } idx /= N_ITERS; - printf("min at idx %d\n", idx); + printf("min at idx %d (optimized)\n", idx); auto end = std::chrono::steady_clock::now(); printf("-> elapsed=%d\n", static_cast((end - start).count())); } @@ -221,7 +281,27 @@ int main(int /* argc */, char * /* argv */[]) gdal::min_element(x.data(), x.size(), eDT, true, 0)); } idx /= N_ITERS; - printf("min at idx %d(nodata case)\n", idx); + printf("min at idx %d (nodata case, optimized)\n", idx); + auto end = std::chrono::steady_clock::now(); + printf("-> elapsed=%d\n", static_cast((end - start).count())); + } + { + auto start = std::chrono::steady_clock::now(); + int idx = 0; + for (int i = 0; i < N_ITERS; ++i) + { + idx += static_cast(std::distance( + x.begin(), std::min_element(x.begin(), x.end(), + [](T x, T y) { + return y == 0 ? true + : x == 0 ? 
false + : x < y; + }))); + } + idx /= N_ITERS; + printf("min at idx %d (nodata case, using std::min_element with " + "nodata aware comparison)\n", + idx); auto end = std::chrono::steady_clock::now(); printf("-> elapsed=%d\n", static_cast((end - start).count())); } @@ -243,7 +323,7 @@ int main(int /* argc */, char * /* argv */[]) gdal::min_element(x.data(), x.size(), eDT, false, 0)); } idx /= N_ITERS; - printf("min at idx %d\n", idx); + printf("min at idx %d (optimized)\n", idx); auto end = std::chrono::steady_clock::now(); printf("-> elapsed=%d\n", static_cast((end - start).count())); } @@ -269,7 +349,27 @@ int main(int /* argc */, char * /* argv */[]) gdal::min_element(x.data(), x.size(), eDT, true, 0)); } idx /= N_ITERS; - printf("min at idx %d(nodata case)\n", idx); + printf("min at idx %d (nodata case, optimized)\n", idx); + auto end = std::chrono::steady_clock::now(); + printf("-> elapsed=%d\n", static_cast((end - start).count())); + } + { + auto start = std::chrono::steady_clock::now(); + int idx = 0; + for (int i = 0; i < N_ITERS; ++i) + { + idx += static_cast(std::distance( + x.begin(), std::min_element(x.begin(), x.end(), + [](T x, T y) { + return y == 0 ? true + : x == 0 ? false + : x < y; + }))); + } + idx /= N_ITERS; + printf("min at idx %d (nodata case, using std::min_element with " + "nodata aware comparison)\n", + idx); auto end = std::chrono::steady_clock::now(); printf("-> elapsed=%d\n", static_cast((end - start).count())); } @@ -291,7 +391,7 @@ int main(int /* argc */, char * /* argv */[]) gdal::min_element(x.data(), x.size(), eDT, false, 0)); } idx /= N_ITERS; - printf("min at idx %d\n", idx); + printf("min at idx %d (optimized)\n", idx); auto end = std::chrono::steady_clock::now(); printf("-> elapsed=%d\n", static_cast((end - start).count())); } @@ -317,7 +417,27 @@ int main(int /* argc */, char * /* argv */[]) gdal::min_element(x.data(), x.size(), eDT, true, 0)); } idx /= N_ITERS; - printf("min at idx %d(nodata case)\n", idx); + printf("min at idx %d (nodata case, optimized)\n", idx); + auto end = std::chrono::steady_clock::now(); + printf("-> elapsed=%d\n", static_cast((end - start).count())); + } + { + auto start = std::chrono::steady_clock::now(); + int idx = 0; + for (int i = 0; i < N_ITERS; ++i) + { + idx += static_cast(std::distance( + x.begin(), std::min_element(x.begin(), x.end(), + [](T x, T y) { + return y == 0 ? true + : x == 0 ? 
false + : x < y; + }))); + } + idx /= N_ITERS; + printf("min at idx %d (nodata case, using std::min_element with " + "nodata aware comparison)\n", + idx); auto end = std::chrono::steady_clock::now(); printf("-> elapsed=%d\n", static_cast((end - start).count())); } @@ -326,7 +446,7 @@ int main(int /* argc */, char * /* argv */[]) { using T = float; constexpr GDALDataType eDT = GDT_Float32; - printf("float:\n"); + printf("float (*with* NaN):\n"); std::vector x; x.resize(SIZE); randomFill(x.data(), x.size()); @@ -339,7 +459,7 @@ int main(int /* argc */, char * /* argv */[]) gdal::min_element(x.data(), x.size(), eDT, false, 0)); } idx /= N_ITERS; - printf("min at idx %d\n", idx); + printf("min at idx %d (optimized)\n", idx); auto end = std::chrono::steady_clock::now(); printf("-> elapsed=%d\n", static_cast((end - start).count())); } @@ -373,7 +493,31 @@ int main(int /* argc */, char * /* argv */[]) gdal::min_element(x.data(), x.size(), eDT, true, 0)); } idx /= N_ITERS; - printf("min at idx %d(nodata case)\n", idx); + printf("min at idx %d (nodata case, optimized)\n", idx); + auto end = std::chrono::steady_clock::now(); + printf("-> elapsed=%d\n", static_cast((end - start).count())); + } + { + auto start = std::chrono::steady_clock::now(); + int idx = 0; + for (int i = 0; i < N_ITERS; ++i) + { + idx += static_cast(std::distance( + x.begin(), std::min_element(x.begin(), x.end(), + [](double x, double y) + { + return std::isnan(y) ? true + : std::isnan(x) + ? false + : y == 0 ? true + : x == 0 ? false + : x < y; + }))); + } + idx /= N_ITERS; + printf("min at idx %d (nodata case, using std::min_element with " + "nodata aware and NaN aware comparison)\n", + idx); auto end = std::chrono::steady_clock::now(); printf("-> elapsed=%d\n", static_cast((end - start).count())); } @@ -395,7 +539,7 @@ int main(int /* argc */, char * /* argv */[]) gdal::min_element(x.data(), x.size(), eDT, false, 0)); } idx /= N_ITERS; - printf("min at idx %d\n", idx); + printf("min at idx %d (optimized)\n", idx); auto end = std::chrono::steady_clock::now(); printf("-> elapsed=%d\n", static_cast((end - start).count())); } @@ -421,7 +565,27 @@ int main(int /* argc */, char * /* argv */[]) gdal::min_element(x.data(), x.size(), eDT, true, 0)); } idx /= N_ITERS; - printf("min at idx %d(nodata case)\n", idx); + printf("min at idx %d (nodata case, optimized)\n", idx); + auto end = std::chrono::steady_clock::now(); + printf("-> elapsed=%d\n", static_cast((end - start).count())); + } + { + auto start = std::chrono::steady_clock::now(); + int idx = 0; + for (int i = 0; i < N_ITERS; ++i) + { + idx += static_cast(std::distance( + x.begin(), std::min_element(x.begin(), x.end(), + [](T x, T y) { + return y == 0 ? true + : x == 0 ? 
false + : x < y; + }))); + } + idx /= N_ITERS; + printf("min at idx %d (nodata case, using std::min_element with " + "nodata aware comparison)\n", + idx); auto end = std::chrono::steady_clock::now(); printf("-> elapsed=%d\n", static_cast((end - start).count())); } @@ -430,7 +594,7 @@ int main(int /* argc */, char * /* argv */[]) { using T = double; constexpr GDALDataType eDT = GDT_Float64; - printf("double:\n"); + printf("double (*with* NaN):\n"); std::vector x; x.resize(SIZE); randomFill(x.data(), x.size()); @@ -443,7 +607,7 @@ int main(int /* argc */, char * /* argv */[]) gdal::min_element(x.data(), x.size(), eDT, false, 0)); } idx /= N_ITERS; - printf("min at idx %d\n", idx); + printf("min at idx %d (optimized)\n", idx); auto end = std::chrono::steady_clock::now(); printf("-> elapsed=%d\n", static_cast((end - start).count())); } @@ -477,7 +641,31 @@ int main(int /* argc */, char * /* argv */[]) gdal::min_element(x.data(), x.size(), eDT, true, 0)); } idx /= N_ITERS; - printf("min at idx %d(nodata case)\n", idx); + printf("min at idx %d (nodata case, optimized)\n", idx); + auto end = std::chrono::steady_clock::now(); + printf("-> elapsed=%d\n", static_cast((end - start).count())); + } + { + auto start = std::chrono::steady_clock::now(); + int idx = 0; + for (int i = 0; i < N_ITERS; ++i) + { + idx += static_cast(std::distance( + x.begin(), std::min_element(x.begin(), x.end(), + [](double x, double y) + { + return std::isnan(y) ? true + : std::isnan(x) + ? false + : y == 0 ? true + : x == 0 ? false + : x < y; + }))); + } + idx /= N_ITERS; + printf("min at idx %d (nodata case, using std::min_element with " + "nodata aware and NaN aware comparison)\n", + idx); auto end = std::chrono::steady_clock::now(); printf("-> elapsed=%d\n", static_cast((end - start).count())); } @@ -499,7 +687,7 @@ int main(int /* argc */, char * /* argv */[]) gdal::min_element(x.data(), x.size(), eDT, false, 0)); } idx /= N_ITERS; - printf("min at idx %d\n", idx); + printf("min at idx %d (optimized)\n", idx); auto end = std::chrono::steady_clock::now(); printf("-> elapsed=%d\n", static_cast((end - start).count())); } @@ -525,7 +713,27 @@ int main(int /* argc */, char * /* argv */[]) gdal::min_element(x.data(), x.size(), eDT, true, 0)); } idx /= N_ITERS; - printf("min at idx %d(nodata case)\n", idx); + printf("min at idx %d (nodata case, optimized)\n", idx); + auto end = std::chrono::steady_clock::now(); + printf("-> elapsed=%d\n", static_cast((end - start).count())); + } + { + auto start = std::chrono::steady_clock::now(); + int idx = 0; + for (int i = 0; i < N_ITERS; ++i) + { + idx += static_cast(std::distance( + x.begin(), std::min_element(x.begin(), x.end(), + [](T x, T y) { + return y == 0 ? true + : x == 0 ? 
false + : x < y; + }))); + } + idx /= N_ITERS; + printf("min at idx %d (nodata case, using std::min_element with " + "nodata aware comparison)\n", + idx); auto end = std::chrono::steady_clock::now(); printf("-> elapsed=%d\n", static_cast((end - start).count())); } From 5b332b53632d3194a90a7a5c265076e3a4a58f04 Mon Sep 17 00:00:00 2001 From: Even Rouault Date: Sun, 3 Nov 2024 23:01:13 +0100 Subject: [PATCH 13/62] testperf_gdal_minmax_element.cpp: fix -Wshadow compiler warnings --- perftests/testperf_gdal_minmax_element.cpp | 104 ++++++++++----------- 1 file changed, 52 insertions(+), 52 deletions(-) diff --git a/perftests/testperf_gdal_minmax_element.cpp b/perftests/testperf_gdal_minmax_element.cpp index c89ee9d50149..862b0613d7f1 100644 --- a/perftests/testperf_gdal_minmax_element.cpp +++ b/perftests/testperf_gdal_minmax_element.cpp @@ -88,10 +88,10 @@ int main(int /* argc */, char * /* argv */[]) { idx += static_cast(std::distance( x.begin(), std::min_element(x.begin(), x.end(), - [](T x, T y) { - return y == 0 ? true - : x == 0 ? false - : x < y; + [](T a, T b) { + return b == 0 ? true + : a == 0 ? false + : a < b; }))); } idx /= N_ITERS; @@ -156,10 +156,10 @@ int main(int /* argc */, char * /* argv */[]) { idx += static_cast(std::distance( x.begin(), std::min_element(x.begin(), x.end(), - [](T x, T y) { - return y == 0 ? true - : x == 0 ? false - : x < y; + [](T a, T b) { + return b == 0 ? true + : a == 0 ? false + : a < b; }))); } idx /= N_ITERS; @@ -224,10 +224,10 @@ int main(int /* argc */, char * /* argv */[]) { idx += static_cast(std::distance( x.begin(), std::min_element(x.begin(), x.end(), - [](T x, T y) { - return y == 0 ? true - : x == 0 ? false - : x < y; + [](T a, T b) { + return b == 0 ? true + : a == 0 ? false + : a < b; }))); } idx /= N_ITERS; @@ -292,10 +292,10 @@ int main(int /* argc */, char * /* argv */[]) { idx += static_cast(std::distance( x.begin(), std::min_element(x.begin(), x.end(), - [](T x, T y) { - return y == 0 ? true - : x == 0 ? false - : x < y; + [](T a, T b) { + return b == 0 ? true + : a == 0 ? false + : a < b; }))); } idx /= N_ITERS; @@ -360,10 +360,10 @@ int main(int /* argc */, char * /* argv */[]) { idx += static_cast(std::distance( x.begin(), std::min_element(x.begin(), x.end(), - [](T x, T y) { - return y == 0 ? true - : x == 0 ? false - : x < y; + [](T a, T b) { + return b == 0 ? true + : a == 0 ? false + : a < b; }))); } idx /= N_ITERS; @@ -428,10 +428,10 @@ int main(int /* argc */, char * /* argv */[]) { idx += static_cast(std::distance( x.begin(), std::min_element(x.begin(), x.end(), - [](T x, T y) { - return y == 0 ? true - : x == 0 ? false - : x < y; + [](T a, T b) { + return b == 0 ? true + : a == 0 ? false + : a < b; }))); } idx /= N_ITERS; @@ -470,11 +470,11 @@ int main(int /* argc */, char * /* argv */[]) { idx += static_cast(std::distance( x.begin(), std::min_element(x.begin(), x.end(), - [](float x, float y) { - return std::isnan(y) ? true - : std::isnan(x) + [](T a, T b) { + return std::isnan(b) ? true + : std::isnan(a) ? false - : x < y; + : a < b; }))); } idx /= N_ITERS; @@ -504,14 +504,14 @@ int main(int /* argc */, char * /* argv */[]) { idx += static_cast(std::distance( x.begin(), std::min_element(x.begin(), x.end(), - [](double x, double y) + [](T a, T b) { - return std::isnan(y) ? true - : std::isnan(x) + return std::isnan(b) ? true + : std::isnan(a) ? false - : y == 0 ? true - : x == 0 ? false - : x < y; + : b == 0 ? true + : a == 0 ? 
false + : a < b; }))); } idx /= N_ITERS; @@ -576,10 +576,10 @@ int main(int /* argc */, char * /* argv */[]) { idx += static_cast(std::distance( x.begin(), std::min_element(x.begin(), x.end(), - [](T x, T y) { - return y == 0 ? true - : x == 0 ? false - : x < y; + [](T a, T b) { + return b == 0 ? true + : a == 0 ? false + : a < b; }))); } idx /= N_ITERS; @@ -618,11 +618,11 @@ int main(int /* argc */, char * /* argv */[]) { idx += static_cast(std::distance( x.begin(), std::min_element(x.begin(), x.end(), - [](double x, double y) { - return std::isnan(y) ? true - : std::isnan(x) + [](T a, T b) { + return std::isnan(b) ? true + : std::isnan(a) ? false - : x < y; + : a < b; }))); } idx /= N_ITERS; @@ -652,14 +652,14 @@ int main(int /* argc */, char * /* argv */[]) { idx += static_cast(std::distance( x.begin(), std::min_element(x.begin(), x.end(), - [](double x, double y) + [](T a, T b) { - return std::isnan(y) ? true - : std::isnan(x) + return std::isnan(b) ? true + : std::isnan(a) ? false - : y == 0 ? true - : x == 0 ? false - : x < y; + : b == 0 ? true + : a == 0 ? false + : a < b; }))); } idx /= N_ITERS; @@ -724,10 +724,10 @@ int main(int /* argc */, char * /* argv */[]) { idx += static_cast(std::distance( x.begin(), std::min_element(x.begin(), x.end(), - [](T x, T y) { - return y == 0 ? true - : x == 0 ? false - : x < y; + [](T a, T b) { + return b == 0 ? true + : a == 0 ? false + : a < b; }))); } idx /= N_ITERS; From 0b95d469a67d222eeb814f1672e0f3bff773bb3b Mon Sep 17 00:00:00 2001 From: Even Rouault Date: Sun, 3 Nov 2024 23:52:25 +0100 Subject: [PATCH 14/62] testperf_gdal_minmax_element.cpp: factor code --- perftests/testperf_gdal_minmax_element.cpp | 866 ++++++--------------- 1 file changed, 228 insertions(+), 638 deletions(-) diff --git a/perftests/testperf_gdal_minmax_element.cpp b/perftests/testperf_gdal_minmax_element.cpp index 862b0613d7f1..2e2a0d2f408f 100644 --- a/perftests/testperf_gdal_minmax_element.cpp +++ b/perftests/testperf_gdal_minmax_element.cpp @@ -31,712 +31,302 @@ template void randomFill(T *v, size_t size, bool withNaN = true) } } -int main(int /* argc */, char * /* argv */[]) +constexpr size_t SIZE = 10 * 1000 * 1000 + 1; +constexpr int N_ITERS = 1; + +template +#if defined(__GNUC__) +__attribute__((noinline)) +#endif +static void +benchIntegers(GDALDataType eDT, T noData) { - constexpr size_t SIZE = 10 * 1000 * 1000 + 1; - constexpr int N_ITERS = 1; + std::vector x; + x.resize(SIZE); + randomFill(x.data(), x.size()); { - using T = uint8_t; - constexpr GDALDataType eDT = GDT_Byte; - printf("uint8:\n"); - std::vector x; - x.resize(SIZE); - randomFill(x.data(), x.size()); + auto start = std::chrono::steady_clock::now(); + int idx = 0; + for (int i = 0; i < N_ITERS; ++i) { - auto start = std::chrono::steady_clock::now(); - int idx = 0; - for (int i = 0; i < N_ITERS; ++i) - { - idx += static_cast( - gdal::min_element(x.data(), x.size(), eDT, false, 0)); - } - idx /= N_ITERS; - printf("min at idx %d (optimized)\n", idx); - auto end = std::chrono::steady_clock::now(); - printf("-> elapsed=%d\n", static_cast((end - start).count())); + idx += static_cast( + gdal::min_element(x.data(), x.size(), eDT, false, 0)); } + idx /= N_ITERS; + printf("min at idx %d (optimized)\n", idx); + auto end = std::chrono::steady_clock::now(); + printf("-> elapsed=%d\n", static_cast((end - start).count())); + } + { + auto start = std::chrono::steady_clock::now(); + int idx = 0; + for (int i = 0; i < N_ITERS; ++i) { - auto start = std::chrono::steady_clock::now(); - int idx = 0; - for (int i = 0; 
i < N_ITERS; ++i) - { - idx += static_cast(std::distance( - x.begin(), std::min_element(x.begin(), x.end()))); - } - idx /= N_ITERS; - printf("min at idx %d (using std::min_element)\n", idx); - auto end = std::chrono::steady_clock::now(); - printf("-> elapsed=%d\n", static_cast((end - start).count())); + idx += static_cast( + std::distance(x.begin(), std::min_element(x.begin(), x.end()))); } + idx /= N_ITERS; + printf("min at idx %d (using std::min_element)\n", idx); + auto end = std::chrono::steady_clock::now(); + printf("-> elapsed=%d\n", static_cast((end - start).count())); + } + { + auto start = std::chrono::steady_clock::now(); + int idx = 0; + for (int i = 0; i < N_ITERS; ++i) { - auto start = std::chrono::steady_clock::now(); - int idx = 0; - for (int i = 0; i < N_ITERS; ++i) - { - idx += static_cast( - gdal::min_element(x.data(), x.size(), eDT, true, 0)); - } - idx /= N_ITERS; - printf("min at idx %d (nodata case, optimized)\n", idx); - auto end = std::chrono::steady_clock::now(); - printf("-> elapsed=%d\n", static_cast((end - start).count())); + idx += static_cast( + gdal::min_element(x.data(), x.size(), eDT, true, noData)); } + idx /= N_ITERS; + printf("min at idx %d (nodata case, optimized)\n", idx); + auto end = std::chrono::steady_clock::now(); + printf("-> elapsed=%d\n", static_cast((end - start).count())); + } + { + auto start = std::chrono::steady_clock::now(); + int idx = 0; + for (int i = 0; i < N_ITERS; ++i) + { + idx += static_cast(std::distance( + x.begin(), std::min_element(x.begin(), x.end(), + [noData](T a, T b) { + return b == noData ? true + : a == noData ? false + : a < b; + }))); + } + idx /= N_ITERS; + printf("min at idx %d (nodata case, using std::min_element with " + "nodata aware comparison)\n", + idx); + auto end = std::chrono::steady_clock::now(); + printf("-> elapsed=%d\n", static_cast((end - start).count())); + } +} + +template +#if defined(__GNUC__) +__attribute__((noinline)) +#endif +static void +benchFloatingPointsWithNaN(GDALDataType eDT, T noData) +{ + std::vector x; + x.resize(SIZE); + randomFill(x.data(), x.size()); + { + auto start = std::chrono::steady_clock::now(); + int idx = 0; + for (int i = 0; i < N_ITERS; ++i) { - auto start = std::chrono::steady_clock::now(); - int idx = 0; - for (int i = 0; i < N_ITERS; ++i) - { - idx += static_cast(std::distance( - x.begin(), std::min_element(x.begin(), x.end(), - [](T a, T b) { - return b == 0 ? true - : a == 0 ? false - : a < b; - }))); - } - idx /= N_ITERS; - printf("min at idx %d (nodata case, using std::min_element with " - "nodata aware comparison)\n", - idx); - auto end = std::chrono::steady_clock::now(); - printf("-> elapsed=%d\n", static_cast((end - start).count())); + idx += static_cast( + gdal::min_element(x.data(), x.size(), eDT, false, 0)); } + idx /= N_ITERS; + printf("min at idx %d (optimized)\n", idx); + auto end = std::chrono::steady_clock::now(); + printf("-> elapsed=%d\n", static_cast((end - start).count())); } - printf("--------------------\n"); { - using T = int8_t; - constexpr GDALDataType eDT = GDT_Int8; - printf("int8:\n"); - std::vector x; - x.resize(SIZE); - randomFill(x.data(), x.size()); + auto start = std::chrono::steady_clock::now(); + int idx = 0; + for (int i = 0; i < N_ITERS; ++i) + { + idx += static_cast(std::distance( + x.begin(), std::min_element(x.begin(), x.end(), + [](T a, T b) { + return std::isnan(b) ? true + : std::isnan(a) ? 
false + : a < b; + }))); + } + idx /= N_ITERS; + printf("min at idx %d (using std::min_element with NaN aware " + "comparison)\n", + idx); + auto end = std::chrono::steady_clock::now(); + printf("-> elapsed=%d\n", static_cast((end - start).count())); + } + { + auto start = std::chrono::steady_clock::now(); + int idx = 0; + for (int i = 0; i < N_ITERS; ++i) { - auto start = std::chrono::steady_clock::now(); - int idx = 0; - for (int i = 0; i < N_ITERS; ++i) - { - idx += static_cast( - gdal::min_element(x.data(), x.size(), eDT, false, 0)); - } - idx /= N_ITERS; - printf("min at idx %d (optimized)\n", idx); - auto end = std::chrono::steady_clock::now(); - printf("-> elapsed=%d\n", static_cast((end - start).count())); + idx += static_cast( + gdal::min_element(x.data(), x.size(), eDT, true, noData)); } + idx /= N_ITERS; + printf("min at idx %d (nodata case, optimized)\n", idx); + auto end = std::chrono::steady_clock::now(); + printf("-> elapsed=%d\n", static_cast((end - start).count())); + } + { + auto start = std::chrono::steady_clock::now(); + int idx = 0; + for (int i = 0; i < N_ITERS; ++i) + { + idx += static_cast(std::distance( + x.begin(), std::min_element(x.begin(), x.end(), + [noData](T a, T b) + { + return std::isnan(b) ? true + : std::isnan(a) ? false + : b == noData ? true + : a == noData ? false + : a < b; + }))); + } + idx /= N_ITERS; + printf("min at idx %d (nodata case, using std::min_element with " + "nodata aware and NaN aware comparison)\n", + idx); + auto end = std::chrono::steady_clock::now(); + printf("-> elapsed=%d\n", static_cast((end - start).count())); + } +} + +template +#if defined(__GNUC__) +__attribute__((noinline)) +#endif +static void +benchFloatingPointsWithoutNaN(GDALDataType eDT, T noData) +{ + std::vector x; + x.resize(SIZE); + randomFill(x.data(), x.size(), false); + { + auto start = std::chrono::steady_clock::now(); + int idx = 0; + for (int i = 0; i < N_ITERS; ++i) { - auto start = std::chrono::steady_clock::now(); - int idx = 0; - for (int i = 0; i < N_ITERS; ++i) - { - idx += static_cast(std::distance( - x.begin(), std::min_element(x.begin(), x.end()))); - } - idx /= N_ITERS; - printf("min at idx %d (using std::min_element)\n", idx); - auto end = std::chrono::steady_clock::now(); - printf("-> elapsed=%d\n", static_cast((end - start).count())); + idx += static_cast( + gdal::min_element(x.data(), x.size(), eDT, false, 0)); } + idx /= N_ITERS; + printf("min at idx %d (optimized)\n", idx); + auto end = std::chrono::steady_clock::now(); + printf("-> elapsed=%d\n", static_cast((end - start).count())); + } + { + auto start = std::chrono::steady_clock::now(); + int idx = 0; + for (int i = 0; i < N_ITERS; ++i) { - auto start = std::chrono::steady_clock::now(); - int idx = 0; - for (int i = 0; i < N_ITERS; ++i) - { - idx += static_cast( - gdal::min_element(x.data(), x.size(), eDT, true, 0)); - } - idx /= N_ITERS; - printf("min at idx %d (nodata case, optimized)\n", idx); - auto end = std::chrono::steady_clock::now(); - printf("-> elapsed=%d\n", static_cast((end - start).count())); + idx += static_cast( + std::distance(x.begin(), std::min_element(x.begin(), x.end()))); } + idx /= N_ITERS; + printf("min at idx %d (using std::min_element)\n", idx); + auto end = std::chrono::steady_clock::now(); + printf("-> elapsed=%d\n", static_cast((end - start).count())); + } + { + auto start = std::chrono::steady_clock::now(); + int idx = 0; + for (int i = 0; i < N_ITERS; ++i) { - auto start = std::chrono::steady_clock::now(); - int idx = 0; - for (int i = 0; i < N_ITERS; ++i) - { 
- idx += static_cast(std::distance( - x.begin(), std::min_element(x.begin(), x.end(), - [](T a, T b) { - return b == 0 ? true - : a == 0 ? false - : a < b; - }))); - } - idx /= N_ITERS; - printf("min at idx %d (nodata case, using std::min_element with " - "nodata aware comparison)\n", - idx); - auto end = std::chrono::steady_clock::now(); - printf("-> elapsed=%d\n", static_cast((end - start).count())); + idx += static_cast( + gdal::min_element(x.data(), x.size(), eDT, true, noData)); } + idx /= N_ITERS; + printf("min at idx %d (nodata case, optimized)\n", idx); + auto end = std::chrono::steady_clock::now(); + printf("-> elapsed=%d\n", static_cast((end - start).count())); + } + { + auto start = std::chrono::steady_clock::now(); + int idx = 0; + for (int i = 0; i < N_ITERS; ++i) + { + idx += static_cast(std::distance( + x.begin(), std::min_element(x.begin(), x.end(), + [noData](T a, T b) { + return b == noData ? true + : a == noData ? false + : a < b; + }))); + } + idx /= N_ITERS; + printf("min at idx %d (nodata case, using std::min_element with " + "nodata aware comparison)\n", + idx); + auto end = std::chrono::steady_clock::now(); + printf("-> elapsed=%d\n", static_cast((end - start).count())); + } +} + +int main(int /* argc */, char * /* argv */[]) +{ + { + using T = uint8_t; + constexpr GDALDataType eDT = GDT_Byte; + printf("uint8:\n"); + benchIntegers(eDT, 0); + } + printf("--------------------\n"); + { + using T = int8_t; + constexpr GDALDataType eDT = GDT_Int8; + printf("int8:\n"); + benchIntegers(eDT, 0); } printf("--------------------\n"); { using T = uint16_t; constexpr GDALDataType eDT = GDT_UInt16; printf("uint16:\n"); - std::vector x; - x.resize(SIZE); - randomFill(x.data(), x.size()); - { - auto start = std::chrono::steady_clock::now(); - int idx = 0; - for (int i = 0; i < N_ITERS; ++i) - { - idx += static_cast( - gdal::min_element(x.data(), x.size(), eDT, false, 0)); - } - idx /= N_ITERS; - printf("min at idx %d (optimized)\n", idx); - auto end = std::chrono::steady_clock::now(); - printf("-> elapsed=%d\n", static_cast((end - start).count())); - } - { - auto start = std::chrono::steady_clock::now(); - int idx = 0; - for (int i = 0; i < N_ITERS; ++i) - { - idx += static_cast(std::distance( - x.begin(), std::min_element(x.begin(), x.end()))); - } - idx /= N_ITERS; - printf("min at idx %d (using std::min_element)\n", idx); - auto end = std::chrono::steady_clock::now(); - printf("-> elapsed=%d\n", static_cast((end - start).count())); - } - { - auto start = std::chrono::steady_clock::now(); - int idx = 0; - for (int i = 0; i < N_ITERS; ++i) - { - idx += static_cast( - gdal::min_element(x.data(), x.size(), eDT, true, 0)); - } - idx /= N_ITERS; - printf("min at idx %d (nodata case, optimized)\n", idx); - auto end = std::chrono::steady_clock::now(); - printf("-> elapsed=%d\n", static_cast((end - start).count())); - } - { - auto start = std::chrono::steady_clock::now(); - int idx = 0; - for (int i = 0; i < N_ITERS; ++i) - { - idx += static_cast(std::distance( - x.begin(), std::min_element(x.begin(), x.end(), - [](T a, T b) { - return b == 0 ? true - : a == 0 ? 
false - : a < b; - }))); - } - idx /= N_ITERS; - printf("min at idx %d (nodata case, using std::min_element with " - "nodata aware comparison)\n", - idx); - auto end = std::chrono::steady_clock::now(); - printf("-> elapsed=%d\n", static_cast((end - start).count())); - } + benchIntegers(eDT, 0); } printf("--------------------\n"); { using T = int16_t; constexpr GDALDataType eDT = GDT_Int16; printf("int16:\n"); - std::vector x; - x.resize(SIZE); - randomFill(x.data(), x.size()); - { - auto start = std::chrono::steady_clock::now(); - int idx = 0; - for (int i = 0; i < N_ITERS; ++i) - { - idx += static_cast( - gdal::min_element(x.data(), x.size(), eDT, false, 0)); - } - idx /= N_ITERS; - printf("min at idx %d (optimized)\n", idx); - auto end = std::chrono::steady_clock::now(); - printf("-> elapsed=%d\n", static_cast((end - start).count())); - } - { - auto start = std::chrono::steady_clock::now(); - int idx = 0; - for (int i = 0; i < N_ITERS; ++i) - { - idx += static_cast(std::distance( - x.begin(), std::min_element(x.begin(), x.end()))); - } - idx /= N_ITERS; - printf("min at idx %d (using std::min_element)\n", idx); - auto end = std::chrono::steady_clock::now(); - printf("-> elapsed=%d\n", static_cast((end - start).count())); - } - { - auto start = std::chrono::steady_clock::now(); - int idx = 0; - for (int i = 0; i < N_ITERS; ++i) - { - idx += static_cast( - gdal::min_element(x.data(), x.size(), eDT, true, 0)); - } - idx /= N_ITERS; - printf("min at idx %d (nodata case, optimized)\n", idx); - auto end = std::chrono::steady_clock::now(); - printf("-> elapsed=%d\n", static_cast((end - start).count())); - } - { - auto start = std::chrono::steady_clock::now(); - int idx = 0; - for (int i = 0; i < N_ITERS; ++i) - { - idx += static_cast(std::distance( - x.begin(), std::min_element(x.begin(), x.end(), - [](T a, T b) { - return b == 0 ? true - : a == 0 ? 
false - : a < b; - }))); - } - idx /= N_ITERS; - printf("min at idx %d (nodata case, using std::min_element with " - "nodata aware comparison)\n", - idx); - auto end = std::chrono::steady_clock::now(); - printf("-> elapsed=%d\n", static_cast((end - start).count())); - } + benchIntegers(eDT, 0); } printf("--------------------\n"); { using T = uint32_t; constexpr GDALDataType eDT = GDT_UInt32; printf("uint32:\n"); - std::vector x; - x.resize(SIZE); - randomFill(x.data(), x.size()); - { - auto start = std::chrono::steady_clock::now(); - int idx = 0; - for (int i = 0; i < N_ITERS; ++i) - { - idx += static_cast( - gdal::min_element(x.data(), x.size(), eDT, false, 0)); - } - idx /= N_ITERS; - printf("min at idx %d (optimized)\n", idx); - auto end = std::chrono::steady_clock::now(); - printf("-> elapsed=%d\n", static_cast((end - start).count())); - } - { - auto start = std::chrono::steady_clock::now(); - int idx = 0; - for (int i = 0; i < N_ITERS; ++i) - { - idx += static_cast(std::distance( - x.begin(), std::min_element(x.begin(), x.end()))); - } - idx /= N_ITERS; - printf("min at idx %d (using std::min_element)\n", idx); - auto end = std::chrono::steady_clock::now(); - printf("-> elapsed=%d\n", static_cast((end - start).count())); - } - { - auto start = std::chrono::steady_clock::now(); - int idx = 0; - for (int i = 0; i < N_ITERS; ++i) - { - idx += static_cast( - gdal::min_element(x.data(), x.size(), eDT, true, 0)); - } - idx /= N_ITERS; - printf("min at idx %d (nodata case, optimized)\n", idx); - auto end = std::chrono::steady_clock::now(); - printf("-> elapsed=%d\n", static_cast((end - start).count())); - } - { - auto start = std::chrono::steady_clock::now(); - int idx = 0; - for (int i = 0; i < N_ITERS; ++i) - { - idx += static_cast(std::distance( - x.begin(), std::min_element(x.begin(), x.end(), - [](T a, T b) { - return b == 0 ? true - : a == 0 ? 
false - : a < b; - }))); - } - idx /= N_ITERS; - printf("min at idx %d (nodata case, using std::min_element with " - "nodata aware comparison)\n", - idx); - auto end = std::chrono::steady_clock::now(); - printf("-> elapsed=%d\n", static_cast((end - start).count())); - } + benchIntegers(eDT, 0); } printf("--------------------\n"); { using T = int32_t; constexpr GDALDataType eDT = GDT_Int32; printf("int32:\n"); - std::vector x; - x.resize(SIZE); - randomFill(x.data(), x.size()); - { - auto start = std::chrono::steady_clock::now(); - int idx = 0; - for (int i = 0; i < N_ITERS; ++i) - { - idx += static_cast( - gdal::min_element(x.data(), x.size(), eDT, false, 0)); - } - idx /= N_ITERS; - printf("min at idx %d (optimized)\n", idx); - auto end = std::chrono::steady_clock::now(); - printf("-> elapsed=%d\n", static_cast((end - start).count())); - } - { - auto start = std::chrono::steady_clock::now(); - int idx = 0; - for (int i = 0; i < N_ITERS; ++i) - { - idx += static_cast(std::distance( - x.begin(), std::min_element(x.begin(), x.end()))); - } - idx /= N_ITERS; - printf("min at idx %d (using std::min_element)\n", idx); - auto end = std::chrono::steady_clock::now(); - printf("-> elapsed=%d\n", static_cast((end - start).count())); - } - { - auto start = std::chrono::steady_clock::now(); - int idx = 0; - for (int i = 0; i < N_ITERS; ++i) - { - idx += static_cast( - gdal::min_element(x.data(), x.size(), eDT, true, 0)); - } - idx /= N_ITERS; - printf("min at idx %d (nodata case, optimized)\n", idx); - auto end = std::chrono::steady_clock::now(); - printf("-> elapsed=%d\n", static_cast((end - start).count())); - } - { - auto start = std::chrono::steady_clock::now(); - int idx = 0; - for (int i = 0; i < N_ITERS; ++i) - { - idx += static_cast(std::distance( - x.begin(), std::min_element(x.begin(), x.end(), - [](T a, T b) { - return b == 0 ? true - : a == 0 ? false - : a < b; - }))); - } - idx /= N_ITERS; - printf("min at idx %d (nodata case, using std::min_element with " - "nodata aware comparison)\n", - idx); - auto end = std::chrono::steady_clock::now(); - printf("-> elapsed=%d\n", static_cast((end - start).count())); - } + benchIntegers(eDT, 0); } printf("--------------------\n"); { using T = float; constexpr GDALDataType eDT = GDT_Float32; printf("float (*with* NaN):\n"); - std::vector x; - x.resize(SIZE); - randomFill(x.data(), x.size()); - { - auto start = std::chrono::steady_clock::now(); - int idx = 0; - for (int i = 0; i < N_ITERS; ++i) - { - idx += static_cast( - gdal::min_element(x.data(), x.size(), eDT, false, 0)); - } - idx /= N_ITERS; - printf("min at idx %d (optimized)\n", idx); - auto end = std::chrono::steady_clock::now(); - printf("-> elapsed=%d\n", static_cast((end - start).count())); - } - { - auto start = std::chrono::steady_clock::now(); - int idx = 0; - for (int i = 0; i < N_ITERS; ++i) - { - idx += static_cast(std::distance( - x.begin(), std::min_element(x.begin(), x.end(), - [](T a, T b) { - return std::isnan(b) ? true - : std::isnan(a) - ? 
false - : a < b; - }))); - } - idx /= N_ITERS; - printf("min at idx %d (using std::min_element with NaN aware " - "comparison)\n", - idx); - auto end = std::chrono::steady_clock::now(); - printf("-> elapsed=%d\n", static_cast((end - start).count())); - } - { - auto start = std::chrono::steady_clock::now(); - int idx = 0; - for (int i = 0; i < N_ITERS; ++i) - { - idx += static_cast( - gdal::min_element(x.data(), x.size(), eDT, true, 0)); - } - idx /= N_ITERS; - printf("min at idx %d (nodata case, optimized)\n", idx); - auto end = std::chrono::steady_clock::now(); - printf("-> elapsed=%d\n", static_cast((end - start).count())); - } - { - auto start = std::chrono::steady_clock::now(); - int idx = 0; - for (int i = 0; i < N_ITERS; ++i) - { - idx += static_cast(std::distance( - x.begin(), std::min_element(x.begin(), x.end(), - [](T a, T b) - { - return std::isnan(b) ? true - : std::isnan(a) - ? false - : b == 0 ? true - : a == 0 ? false - : a < b; - }))); - } - idx /= N_ITERS; - printf("min at idx %d (nodata case, using std::min_element with " - "nodata aware and NaN aware comparison)\n", - idx); - auto end = std::chrono::steady_clock::now(); - printf("-> elapsed=%d\n", static_cast((end - start).count())); - } + benchFloatingPointsWithNaN(eDT, 0); } printf("--------------------\n"); { using T = float; constexpr GDALDataType eDT = GDT_Float32; printf("float (without NaN):\n"); - std::vector x; - x.resize(SIZE); - randomFill(x.data(), x.size(), false); - { - auto start = std::chrono::steady_clock::now(); - int idx = 0; - for (int i = 0; i < N_ITERS; ++i) - { - idx += static_cast( - gdal::min_element(x.data(), x.size(), eDT, false, 0)); - } - idx /= N_ITERS; - printf("min at idx %d (optimized)\n", idx); - auto end = std::chrono::steady_clock::now(); - printf("-> elapsed=%d\n", static_cast((end - start).count())); - } - { - auto start = std::chrono::steady_clock::now(); - int idx = 0; - for (int i = 0; i < N_ITERS; ++i) - { - idx += static_cast(std::distance( - x.begin(), std::min_element(x.begin(), x.end()))); - } - idx /= N_ITERS; - printf("min at idx %d (using std::min_element)\n", idx); - auto end = std::chrono::steady_clock::now(); - printf("-> elapsed=%d\n", static_cast((end - start).count())); - } - { - auto start = std::chrono::steady_clock::now(); - int idx = 0; - for (int i = 0; i < N_ITERS; ++i) - { - idx += static_cast( - gdal::min_element(x.data(), x.size(), eDT, true, 0)); - } - idx /= N_ITERS; - printf("min at idx %d (nodata case, optimized)\n", idx); - auto end = std::chrono::steady_clock::now(); - printf("-> elapsed=%d\n", static_cast((end - start).count())); - } - { - auto start = std::chrono::steady_clock::now(); - int idx = 0; - for (int i = 0; i < N_ITERS; ++i) - { - idx += static_cast(std::distance( - x.begin(), std::min_element(x.begin(), x.end(), - [](T a, T b) { - return b == 0 ? true - : a == 0 ? 
false - : a < b; - }))); - } - idx /= N_ITERS; - printf("min at idx %d (nodata case, using std::min_element with " - "nodata aware comparison)\n", - idx); - auto end = std::chrono::steady_clock::now(); - printf("-> elapsed=%d\n", static_cast((end - start).count())); - } + benchFloatingPointsWithoutNaN(eDT, 0); } printf("--------------------\n"); { using T = double; constexpr GDALDataType eDT = GDT_Float64; printf("double (*with* NaN):\n"); - std::vector x; - x.resize(SIZE); - randomFill(x.data(), x.size()); - { - auto start = std::chrono::steady_clock::now(); - int idx = 0; - for (int i = 0; i < N_ITERS; ++i) - { - idx += static_cast( - gdal::min_element(x.data(), x.size(), eDT, false, 0)); - } - idx /= N_ITERS; - printf("min at idx %d (optimized)\n", idx); - auto end = std::chrono::steady_clock::now(); - printf("-> elapsed=%d\n", static_cast((end - start).count())); - } - { - auto start = std::chrono::steady_clock::now(); - int idx = 0; - for (int i = 0; i < N_ITERS; ++i) - { - idx += static_cast(std::distance( - x.begin(), std::min_element(x.begin(), x.end(), - [](T a, T b) { - return std::isnan(b) ? true - : std::isnan(a) - ? false - : a < b; - }))); - } - idx /= N_ITERS; - printf("min at idx %d (using std::min_element with NaN aware " - "comparison)\n", - idx); - auto end = std::chrono::steady_clock::now(); - printf("-> elapsed=%d\n", static_cast((end - start).count())); - } - { - auto start = std::chrono::steady_clock::now(); - int idx = 0; - for (int i = 0; i < N_ITERS; ++i) - { - idx += static_cast( - gdal::min_element(x.data(), x.size(), eDT, true, 0)); - } - idx /= N_ITERS; - printf("min at idx %d (nodata case, optimized)\n", idx); - auto end = std::chrono::steady_clock::now(); - printf("-> elapsed=%d\n", static_cast((end - start).count())); - } - { - auto start = std::chrono::steady_clock::now(); - int idx = 0; - for (int i = 0; i < N_ITERS; ++i) - { - idx += static_cast(std::distance( - x.begin(), std::min_element(x.begin(), x.end(), - [](T a, T b) - { - return std::isnan(b) ? true - : std::isnan(a) - ? false - : b == 0 ? true - : a == 0 ? 
false - : a < b; - }))); - } - idx /= N_ITERS; - printf("min at idx %d (nodata case, using std::min_element with " - "nodata aware and NaN aware comparison)\n", - idx); - auto end = std::chrono::steady_clock::now(); - printf("-> elapsed=%d\n", static_cast((end - start).count())); - } + benchFloatingPointsWithNaN(eDT, 0); } printf("--------------------\n"); { using T = double; constexpr GDALDataType eDT = GDT_Float64; printf("double (without NaN):\n"); - std::vector x; - x.resize(SIZE); - randomFill(x.data(), x.size(), false); - { - auto start = std::chrono::steady_clock::now(); - int idx = 0; - for (int i = 0; i < N_ITERS; ++i) - { - idx += static_cast( - gdal::min_element(x.data(), x.size(), eDT, false, 0)); - } - idx /= N_ITERS; - printf("min at idx %d (optimized)\n", idx); - auto end = std::chrono::steady_clock::now(); - printf("-> elapsed=%d\n", static_cast((end - start).count())); - } - { - auto start = std::chrono::steady_clock::now(); - int idx = 0; - for (int i = 0; i < N_ITERS; ++i) - { - idx += static_cast(std::distance( - x.begin(), std::min_element(x.begin(), x.end()))); - } - idx /= N_ITERS; - printf("min at idx %d (using std::min_element)\n", idx); - auto end = std::chrono::steady_clock::now(); - printf("-> elapsed=%d\n", static_cast((end - start).count())); - } - { - auto start = std::chrono::steady_clock::now(); - int idx = 0; - for (int i = 0; i < N_ITERS; ++i) - { - idx += static_cast( - gdal::min_element(x.data(), x.size(), eDT, true, 0)); - } - idx /= N_ITERS; - printf("min at idx %d (nodata case, optimized)\n", idx); - auto end = std::chrono::steady_clock::now(); - printf("-> elapsed=%d\n", static_cast((end - start).count())); - } - { - auto start = std::chrono::steady_clock::now(); - int idx = 0; - for (int i = 0; i < N_ITERS; ++i) - { - idx += static_cast(std::distance( - x.begin(), std::min_element(x.begin(), x.end(), - [](T a, T b) { - return b == 0 ? true - : a == 0 ? false - : a < b; - }))); - } - idx /= N_ITERS; - printf("min at idx %d (nodata case, using std::min_element with " - "nodata aware comparison)\n", - idx); - auto end = std::chrono::steady_clock::now(); - printf("-> elapsed=%d\n", static_cast((end - start).count())); - } + benchFloatingPointsWithoutNaN(eDT, 0); } return 0; } From 8ca137613bee46887380bb1b68a5307429f18f61 Mon Sep 17 00:00:00 2001 From: Even Rouault Date: Sun, 3 Nov 2024 22:16:47 +0100 Subject: [PATCH 15/62] Add testperf_gdal_minmax_element as a ctest target --- perftests/CMakeLists.txt | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/perftests/CMakeLists.txt b/perftests/CMakeLists.txt index e020f471d2b9..1c7546dcef97 100644 --- a/perftests/CMakeLists.txt +++ b/perftests/CMakeLists.txt @@ -1,5 +1,18 @@ include(GdalTestTarget) +include(GdalSetRuntimeEnv) +gdal_set_runtime_env(TEST_ENV) + +if (MINGW) + list(APPEND TEST_ENV SKIP_MEM_INTENSIVE_TEST=YES) +endif () + +if (WIN32) + # If running GDAL as a CustomBuild Command os MSBuild, "ERROR bla:" is considered as failing the job. 
This is rarely + # the intended behavior + list(APPEND TEST_ENV "CPL_ERROR_SEPARATOR=\\;") +endif () + gdal_test_target(testperfcopywords testperfcopywords.cpp) gdal_test_target(testperfdeinterleave testperfdeinterleave.cpp) @@ -11,6 +24,6 @@ add_executable(bench_ogr_c_api bench_ogr_c_api.cpp) gdal_standard_includes(bench_ogr_c_api) target_link_libraries(bench_ogr_c_api PRIVATE $) -add_executable(testperf_gdal_minmax_element testperf_gdal_minmax_element.cpp) -gdal_standard_includes(testperf_gdal_minmax_element) -target_link_libraries(testperf_gdal_minmax_element PRIVATE $) +gdal_test_target(testperf_gdal_minmax_element testperf_gdal_minmax_element.cpp) +add_test(NAME testperf_gdal_minmax_element COMMAND testperf_gdal_minmax_element) +set_property(TEST testperf_gdal_minmax_element PROPERTY ENVIRONMENT "${TEST_ENV}") From db65ab5359ca5e37d2f88679ccb49a8a776ed18e Mon Sep 17 00:00:00 2001 From: Even Rouault Date: Mon, 4 Nov 2024 00:39:53 +0100 Subject: [PATCH 16/62] gdal_minmax_element.hpp: fallback to std::min_element/max_element in the non-SSE2 case, as it turns out at least on Apple Silicon that our 'optimized' version is generally slower --- autotest/cpp/test_gdal_minmax_element.cpp | 2 +- gcore/gdal_minmax_element.hpp | 146 ++++++++++++++++++++++ 2 files changed, 147 insertions(+), 1 deletion(-) diff --git a/autotest/cpp/test_gdal_minmax_element.cpp b/autotest/cpp/test_gdal_minmax_element.cpp index f616fc032e45..b6d681dda76d 100644 --- a/autotest/cpp/test_gdal_minmax_element.cpp +++ b/autotest/cpp/test_gdal_minmax_element.cpp @@ -102,7 +102,7 @@ TEST_F(test_gdal_minmax_element, uint8) { std::vector v(257, 0); auto idx_min = gdal::min_element(v.data(), v.size(), eDT, true, 0); - EXPECT_EQ(idx_min, 0); + EXPECT_TRUE(idx_min == 0 || idx_min == 256) << idx_min; } { std::vector v(257, 0); diff --git a/gcore/gdal_minmax_element.hpp b/gcore/gdal_minmax_element.hpp index 62b3dee7e9da..0da63649a06d 100644 --- a/gcore/gdal_minmax_element.hpp +++ b/gcore/gdal_minmax_element.hpp @@ -72,6 +72,7 @@ namespace GDAL_MINMAXELT_NS namespace detail { +#ifdef GDAL_MINMAX_ELEMENT_USE_SSE2 /************************************************************************/ /* compScalar() */ /************************************************************************/ @@ -851,6 +852,137 @@ inline size_t extremum_element(const T *buffer, size_t size, bool bHasNoData, return extremum_element(buffer, size); } +#else + +template +inline size_t extremum_element(const T *buffer, size_t size, bool bHasNoData, + T noDataValue) +{ + if (bHasNoData) + { + if constexpr (std::is_floating_point_v) + { + if (std::isnan(noDataValue)) + { + if constexpr (IS_MAX) + { + return std::max_element(buffer, buffer + size, + [](T a, T b) { + return std::isnan(b) ? false + : std::isnan(a) ? true + : a < b; + }) - + buffer; + } + else + { + return std::min_element(buffer, buffer + size, + [](T a, T b) { + return std::isnan(b) ? true + : std::isnan(a) ? false + : a < b; + }) - + buffer; + } + } + else + { + if constexpr (IS_MAX) + { + return std::max_element(buffer, buffer + size, + [noDataValue](T a, T b) + { + return std::isnan(b) ? false + : std::isnan(a) ? true + : (b == noDataValue) + ? false + : (a == noDataValue) + ? true + : a < b; + }) - + buffer; + } + else + { + return std::min_element(buffer, buffer + size, + [noDataValue](T a, T b) + { + return std::isnan(b) ? true + : std::isnan(a) ? false + : (b == noDataValue) + ? true + : (a == noDataValue) + ? 
false + : a < b; + }) - + buffer; + } + } + } + else + { + if constexpr (IS_MAX) + { + return std::max_element(buffer, buffer + size, + [noDataValue](T a, T b) { + return (b == noDataValue) ? false + : (a == noDataValue) ? true + : a < b; + }) - + buffer; + } + else + { + return std::min_element(buffer, buffer + size, + [noDataValue](T a, T b) { + return (b == noDataValue) ? true + : (a == noDataValue) ? false + : a < b; + }) - + buffer; + } + } + } + else + { + if constexpr (std::is_floating_point_v) + { + if constexpr (IS_MAX) + { + return std::max_element(buffer, buffer + size, + [](T a, T b) { + return std::isnan(b) ? false + : std::isnan(a) ? true + : a < b; + }) - + buffer; + } + else + { + return std::min_element(buffer, buffer + size, + [](T a, T b) { + return std::isnan(b) ? true + : std::isnan(a) ? false + : a < b; + }) - + buffer; + } + } + else + { + if constexpr (IS_MAX) + { + return std::max_element(buffer, buffer + size) - buffer; + } + else + { + return std::min_element(buffer, buffer + size) - buffer; + } + } + } +} +#endif + template size_t extremum_element(const void *buffer, size_t nElts, GDALDataType eDT, bool bHasNoData, double dfNoDataValue) @@ -1149,11 +1281,25 @@ inline std::pair minmax_element(const T *buffer, size_t size, //return std::pair(imin - buffer, imax - buffer); } #else + +#if !defined(GDAL_MINMAX_ELEMENT_USE_SSE2) + if constexpr (!std::is_floating_point_v) + { + if (!bHasNoData) + { + auto [min_iter, max_iter] = + std::minmax_element(buffer, buffer + size); + return std::pair(min_iter - buffer, max_iter - buffer); + } + } +#endif + // Using separately min and max is more efficient than computing them // within the same loop return std::pair( extremum_element(buffer, size, bHasNoData, noDataValue), extremum_element(buffer, size, bHasNoData, noDataValue)); + #endif } #endif From 14c25cd62a3cadd755e5ceb75d8c9c15c4a13a06 Mon Sep 17 00:00:00 2001 From: Even Rouault Date: Sun, 3 Nov 2024 22:39:24 +0100 Subject: [PATCH 17/62] CI ARM64: run cpp perf tests --- ci/travis/osx/script.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/ci/travis/osx/script.sh b/ci/travis/osx/script.sh index a28b9eeac6be..7b212dfc9deb 100755 --- a/ci/travis/osx/script.sh +++ b/ci/travis/osx/script.sh @@ -7,6 +7,9 @@ export PROJ_NETWORK=ON echo 'Running CPP unit tests' (cd build && make quicktest) +echo 'Running CPP perftests' +(cd build && ctest -V -R perf) + echo 'Running Python unit tests' # install test dependencies sudo -H pip3 install -r autotest/requirements.txt From 58d4d3ea790271f30789457f90752a8677edcc73 Mon Sep 17 00:00:00 2001 From: Even Rouault Date: Sun, 3 Nov 2024 12:49:04 +0100 Subject: [PATCH 18/62] Add GDALRasterComputeMinMaxLocation / GDALRasterBand::ComputeMinMaxLocation, and map it to SWIG and add a gdal_minmax_location.py script: ``` $ python ../swig/python/gdal-utils/osgeo_utils/samples/gdal_minmax_location.py byte.tif Minimum=74.0 at (col,line)=(9,17), (X,Y)_georef=(441290.0,3750270.0), (long,lat)_WGS84=(-117.6358076,33.8929309) Maximum=255.0 at (col,line)=(2,18), (X,Y)_georef=(440870.0,3750210.0), (long,lat)_WGS84=(-117.6403456,33.8923662) ``` --- autotest/cpp/test_gdal.cpp | 139 +++++++++ autotest/gcore/basic_test.py | 19 ++ doc/source/api/python_samples.rst | 1 + gcore/gdal.h | 4 + gcore/gdal_priv.h | 3 + gcore/gdalrasterband.cpp | 283 ++++++++++++++++++ swig/include/Band.i | 17 ++ swig/include/python/gdal_python.i | 44 +++ .../samples/gdal_minmax_location.py | 128 ++++++++ 9 files changed, 638 insertions(+) create mode 100644 
swig/python/gdal-utils/osgeo_utils/samples/gdal_minmax_location.py diff --git a/autotest/cpp/test_gdal.cpp b/autotest/cpp/test_gdal.cpp index 45ebfb4ab870..9be606d82c6b 100644 --- a/autotest/cpp/test_gdal.cpp +++ b/autotest/cpp/test_gdal.cpp @@ -4777,4 +4777,143 @@ TEST_F(test_gdal, ReadRaster) } } +// Test GDALComputeRasterMinMaxLocation +TEST_F(test_gdal, GDALComputeRasterMinMaxLocation) +{ + GDALDatasetH hDS = GDALOpen(GCORE_DATA_DIR "byte.tif", GA_ReadOnly); + ASSERT_NE(hDS, nullptr); + GDALRasterBandH hBand = GDALGetRasterBand(hDS, 1); + { + double dfMin = 0; + double dfMax = 0; + int nMinX = -1; + int nMinY = -1; + int nMaxX = -1; + int nMaxY = -1; + EXPECT_EQ(GDALComputeRasterMinMaxLocation(hBand, &dfMin, &dfMax, &nMinX, + &nMinY, &nMaxX, &nMaxY), + CE_None); + EXPECT_EQ(dfMin, 74.0); + EXPECT_EQ(dfMax, 255.0); + EXPECT_EQ(nMinX, 9); + EXPECT_EQ(nMinY, 17); + EXPECT_EQ(nMaxX, 2); + EXPECT_EQ(nMaxY, 18); + GByte val = 0; + EXPECT_EQ(GDALRasterIO(hBand, GF_Read, nMinX, nMinY, 1, 1, &val, 1, 1, + GDT_Byte, 0, 0), + CE_None); + EXPECT_EQ(val, 74); + EXPECT_EQ(GDALRasterIO(hBand, GF_Read, nMaxX, nMaxY, 1, 1, &val, 1, 1, + GDT_Byte, 0, 0), + CE_None); + EXPECT_EQ(val, 255); + } + { + int nMinX = -1; + int nMinY = -1; + EXPECT_EQ(GDALComputeRasterMinMaxLocation(hBand, nullptr, nullptr, + &nMinX, &nMinY, nullptr, + nullptr), + CE_None); + EXPECT_EQ(nMinX, 9); + EXPECT_EQ(nMinY, 17); + } + { + int nMaxX = -1; + int nMaxY = -1; + EXPECT_EQ(GDALComputeRasterMinMaxLocation(hBand, nullptr, nullptr, + nullptr, nullptr, &nMaxX, + &nMaxY), + CE_None); + EXPECT_EQ(nMaxX, 2); + EXPECT_EQ(nMaxY, 18); + } + { + EXPECT_EQ(GDALComputeRasterMinMaxLocation(hBand, nullptr, nullptr, + nullptr, nullptr, nullptr, + nullptr), + CE_None); + } + GDALClose(hDS); +} + +// Test GDALComputeRasterMinMaxLocation +TEST_F(test_gdal, GDALComputeRasterMinMaxLocation_byte_min_max_optim) +{ + GDALDatasetUniquePtr poDS(GDALDriver::FromHandle(GDALGetDriverByName("MEM")) + ->Create("", 1, 4, 1, GDT_Byte, nullptr)); + std::array buffer = { + 1, ////////////////////////////////////////////////////////// + 0, ////////////////////////////////////////////////////////// + 255, ////////////////////////////////////////////////////////// + 1, ////////////////////////////////////////////////////////// + }; + GDALRasterIOExtraArg sExtraArg; + INIT_RASTERIO_EXTRA_ARG(sExtraArg); + EXPECT_EQ(poDS->GetRasterBand(1)->RasterIO( + GF_Write, 0, 0, 1, 4, buffer.data(), 1, 4, GDT_Byte, + sizeof(uint8_t), 1 * sizeof(uint8_t), &sExtraArg), + CE_None); + + double dfMin = 0; + double dfMax = 0; + int nMinX = -1; + int nMinY = -1; + int nMaxX = -1; + int nMaxY = -1; + EXPECT_EQ(poDS->GetRasterBand(1)->ComputeRasterMinMaxLocation( + &dfMin, &dfMax, &nMinX, &nMinY, &nMaxX, &nMaxY), + CE_None); + EXPECT_EQ(dfMin, 0); + EXPECT_EQ(dfMax, 255); + EXPECT_EQ(nMinX, 0); + EXPECT_EQ(nMinY, 1); + EXPECT_EQ(nMaxX, 0); + EXPECT_EQ(nMaxY, 2); +} + +// Test GDALComputeRasterMinMaxLocation +TEST_F(test_gdal, GDALComputeRasterMinMaxLocation_with_mask) +{ + GDALDatasetUniquePtr poDS(GDALDriver::FromHandle(GDALGetDriverByName("MEM")) + ->Create("", 2, 2, 1, GDT_Byte, nullptr)); + std::array buffer = { + 2, 10, ////////////////////////////////////////////////////////// + 4, 20, ////////////////////////////////////////////////////////// + }; + GDALRasterIOExtraArg sExtraArg; + INIT_RASTERIO_EXTRA_ARG(sExtraArg); + EXPECT_EQ(poDS->GetRasterBand(1)->RasterIO( + GF_Write, 0, 0, 2, 2, buffer.data(), 2, 2, GDT_Byte, + sizeof(uint8_t), 2 * sizeof(uint8_t), &sExtraArg), + 
CE_None); + + poDS->GetRasterBand(1)->CreateMaskBand(0); + std::array buffer_mask = { + 0, 255, ////////////////////////////////////////////////////////// + 255, 0, ////////////////////////////////////////////////////////// + }; + EXPECT_EQ(poDS->GetRasterBand(1)->GetMaskBand()->RasterIO( + GF_Write, 0, 0, 2, 2, buffer_mask.data(), 2, 2, GDT_Byte, + sizeof(uint8_t), 2 * sizeof(uint8_t), &sExtraArg), + CE_None); + + double dfMin = 0; + double dfMax = 0; + int nMinX = -1; + int nMinY = -1; + int nMaxX = -1; + int nMaxY = -1; + EXPECT_EQ(poDS->GetRasterBand(1)->ComputeRasterMinMaxLocation( + &dfMin, &dfMax, &nMinX, &nMinY, &nMaxX, &nMaxY), + CE_None); + EXPECT_EQ(dfMin, 4); + EXPECT_EQ(dfMax, 10); + EXPECT_EQ(nMinX, 0); + EXPECT_EQ(nMinY, 1); + EXPECT_EQ(nMaxX, 1); + EXPECT_EQ(nMaxY, 0); +} + } // namespace diff --git a/autotest/gcore/basic_test.py b/autotest/gcore/basic_test.py index 3b399d820925..33f9e58a36ab 100755 --- a/autotest/gcore/basic_test.py +++ b/autotest/gcore/basic_test.py @@ -987,3 +987,22 @@ def test_colorinterp(): assert name not in d d[name] = c assert gdal.GetColorInterpretationByName(name) == c + + +def test_ComputeMinMaxLocation(): + + ds = gdal.Open("data/byte.tif") + ret = ds.GetRasterBand(1).ComputeMinMaxLocation() + assert ( + ret.min == 74 + and ret.max == 255 + and ret.minX == 9 + and ret.minY == 17 + and ret.maxX == 2 + and ret.maxY == 18 + ) + + ds = gdal.GetDriverByName("MEM").Create("", 1, 1, 1, gdal.GDT_Float64) + ds.GetRasterBand(1).Fill(float("nan")) + ret = ds.GetRasterBand(1).ComputeMinMaxLocation() + assert ret is None diff --git a/doc/source/api/python_samples.rst b/doc/source/api/python_samples.rst index a2831f63ffaa..294a2bce48c4 100644 --- a/doc/source/api/python_samples.rst +++ b/doc/source/api/python_samples.rst @@ -47,6 +47,7 @@ Python Raster Sample scripts - hsv_merge: Merge greyscale image into RGB image as intensity in HSV space. - gdal_ls: Display the list of files in a virtual directory, like /vsicurl or /vsizip - gdal_cp: Copy a virtual file + - gdal_minmax_location: returns the location where min/max values of a raster are hit. 
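
(Editor's illustrative note, not part of the patch: the new `ComputeMinMaxLocation` binding exercised by the tests above can be used roughly as sketched below. The input path is a placeholder, and the snippet assumes a GDAL build that includes this change; `ret` is the named tuple returned by the SWIG shadow method, or ``None`` when there is no valid pixel.)

```
from osgeo import gdal

gdal.UseExceptions()
ds = gdal.Open("byte.tif")  # hypothetical input raster
ret = ds.GetRasterBand(1).ComputeMinMaxLocation()
if ret is None:
    print("No valid pixels")
else:
    # Pixel/line coordinates of the extrema, as in the autotest above
    print(f"min={ret.min} at (col,line)=({ret.minX},{ret.minY})")
    print(f"max={ret.max} at (col,line)=({ret.maxX},{ret.maxY})")
```
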
Python Vector Sample scripts ------------------------------ diff --git a/gcore/gdal.h b/gcore/gdal.h index f7a411c393cb..8638f8501b27 100644 --- a/gcore/gdal.h +++ b/gcore/gdal.h @@ -1670,6 +1670,10 @@ CPLErr CPL_DLL CPL_STDCALL GDALSetRasterScale(GDALRasterBandH hBand, CPLErr CPL_DLL CPL_STDCALL GDALComputeRasterMinMax(GDALRasterBandH hBand, int bApproxOK, double adfMinMax[2]); +CPLErr CPL_DLL GDALComputeRasterMinMaxLocation(GDALRasterBandH hBand, + double *pdfMin, double *pdfMax, + int *pnMinX, int *pnMinY, + int *pnMaxX, int *pnMaxY); CPLErr CPL_DLL CPL_STDCALL GDALFlushRasterCache(GDALRasterBandH hBand); CPLErr CPL_DLL CPL_STDCALL GDALDropRasterCache(GDALRasterBandH hBand); CPLErr CPL_DLL CPL_STDCALL GDALGetRasterHistogram( diff --git a/gcore/gdal_priv.h b/gcore/gdal_priv.h index 20dcc3734cf3..c615a3a90319 100644 --- a/gcore/gdal_priv.h +++ b/gcore/gdal_priv.h @@ -1810,6 +1810,9 @@ class CPL_DLL GDALRasterBand : public GDALMajorObject virtual CPLErr SetStatistics(double dfMin, double dfMax, double dfMean, double dfStdDev); virtual CPLErr ComputeRasterMinMax(int bApproxOK, double *adfMinMax); + virtual CPLErr ComputeRasterMinMaxLocation(double *pdfMin, double *pdfMax, + int *pnMinX, int *pnMinY, + int *pnMaxX, int *pnMaxY); // Only defined when Doxygen enabled #ifdef DOXYGEN_SKIP diff --git a/gcore/gdalrasterband.cpp b/gcore/gdalrasterband.cpp index bf2da7a2893f..e84a20fdbef9 100644 --- a/gcore/gdalrasterband.cpp +++ b/gcore/gdalrasterband.cpp @@ -38,6 +38,7 @@ #include "gdal_rat.h" #include "gdal_priv_templates.hpp" #include "gdal_interpolateatpoint.h" +#include "gdal_minmax_element.hpp" /************************************************************************/ /* GDALRasterBand() */ @@ -7415,6 +7416,288 @@ CPLErr CPL_STDCALL GDALComputeRasterMinMax(GDALRasterBandH hBand, int bApproxOK, return poBand->ComputeRasterMinMax(bApproxOK, adfMinMax); } +/************************************************************************/ +/* ComputeRasterMinMaxLocation() */ +/************************************************************************/ + +/** + * \brief Compute the min/max values for a band, and their location. + * + * Pixels whose value matches the nodata value or are masked by the mask + * band are ignored. + * + * If the minimum or maximum value is hit in several locations, it is not + * specified which one will be returned. + * + * @param[out] pdfMin Pointer to the minimum value. + * @param[out] pdfMax Pointer to the maximum value. + * @param[out] pnMinX Pointer to the column where the minimum value is hit. + * @param[out] pnMinY Pointer to the line where the minimum value is hit. + * @param[out] pnMaxX Pointer to the column where the maximum value is hit. + * @param[out] pnMaxY Pointer to the line where the maximum value is hit. + * + * @return CE_None in case of success, CE_Warning if there are no valid values, + * CE_Failure in case of error. 
+ * + * @since GDAL 3.11 + */ + +CPLErr GDALRasterBand::ComputeRasterMinMaxLocation(double *pdfMin, + double *pdfMax, int *pnMinX, + int *pnMinY, int *pnMaxX, + int *pnMaxY) +{ + int nMinX = -1; + int nMinY = -1; + int nMaxX = -1; + int nMaxY = -1; + double dfMin = std::numeric_limits::infinity(); + double dfMax = -std::numeric_limits::infinity(); + if (pdfMin) + *pdfMin = dfMin; + if (pdfMax) + *pdfMax = dfMax; + if (pnMinX) + *pnMinX = nMinX; + if (pnMinY) + *pnMinY = nMinY; + if (pnMaxX) + *pnMaxX = nMaxX; + if (pnMaxY) + *pnMaxY = nMaxY; + + if (GDALDataTypeIsComplex(eDataType)) + { + CPLError(CE_Failure, CPLE_NotSupported, + "Complex data type not supported"); + return CE_Failure; + } + + int bGotNoDataValue = FALSE; + const double dfNoDataValue = GetNoDataValue(&bGotNoDataValue); + bGotNoDataValue = bGotNoDataValue && !std::isnan(dfNoDataValue); + bool bGotFloatNoDataValue = false; + float fNoDataValue = 0.0f; + ComputeFloatNoDataValue(eDataType, dfNoDataValue, bGotNoDataValue, + fNoDataValue, bGotFloatNoDataValue); + + GDALRasterBand *poMaskBand = nullptr; + if (!bGotNoDataValue) + { + const int l_nMaskFlags = GetMaskFlags(); + if (l_nMaskFlags != GMF_ALL_VALID && l_nMaskFlags != GMF_NODATA && + GetColorInterpretation() != GCI_AlphaBand) + { + poMaskBand = GetMaskBand(); + } + } + + bool bSignedByte = false; + if (eDataType == GDT_Byte) + { + EnablePixelTypeSignedByteWarning(false); + const char *pszPixelType = + GetMetadataItem("PIXELTYPE", "IMAGE_STRUCTURE"); + EnablePixelTypeSignedByteWarning(true); + bSignedByte = + pszPixelType != nullptr && EQUAL(pszPixelType, "SIGNEDBYTE"); + } + + GByte *pabyMaskData = nullptr; + if (poMaskBand) + { + pabyMaskData = + static_cast(VSI_MALLOC2_VERBOSE(nBlockXSize, nBlockYSize)); + if (!pabyMaskData) + { + return CE_Failure; + } + } + + if (!InitBlockInfo()) + return CE_Failure; + + const GIntBig nTotalBlocks = + static_cast(nBlocksPerRow) * nBlocksPerColumn; + bool bNeedsMin = pdfMin || pnMinX || pnMinY; + bool bNeedsMax = pdfMax || pnMaxX || pnMaxY; + for (GIntBig iBlock = 0; iBlock < nTotalBlocks; ++iBlock) + { + const int iYBlock = static_cast(iBlock / nBlocksPerRow); + const int iXBlock = static_cast(iBlock % nBlocksPerRow); + + GDALRasterBlock *poBlock = GetLockedBlockRef(iXBlock, iYBlock); + if (poBlock == nullptr) + { + CPLFree(pabyMaskData); + return CE_Failure; + } + + void *const pData = poBlock->GetDataRef(); + + int nXCheck = 0, nYCheck = 0; + GetActualBlockSize(iXBlock, iYBlock, &nXCheck, &nYCheck); + + if (poMaskBand && + poMaskBand->RasterIO(GF_Read, iXBlock * nBlockXSize, + iYBlock * nBlockYSize, nXCheck, nYCheck, + pabyMaskData, nXCheck, nYCheck, GDT_Byte, 0, + nBlockXSize, nullptr) != CE_None) + { + poBlock->DropLock(); + CPLFree(pabyMaskData); + return CE_Failure; + } + + if (poMaskBand || nYCheck < nBlockYSize || nXCheck < nBlockXSize) + { + for (int iY = 0; iY < nYCheck; ++iY) + { + for (int iX = 0; iX < nXCheck; ++iX) + { + const GPtrDiff_t iOffset = + iX + static_cast(iY) * nBlockXSize; + if (pabyMaskData && pabyMaskData[iOffset] == 0) + continue; + bool bValid = true; + double dfValue = GetPixelValue( + eDataType, bSignedByte, pData, iOffset, bGotNoDataValue, + dfNoDataValue, bGotFloatNoDataValue, fNoDataValue, + bValid); + if (!bValid) + continue; + if (dfValue < dfMin) + { + dfMin = dfValue; + nMinX = iXBlock * nBlockXSize + iX; + nMinY = iYBlock * nBlockYSize + iY; + } + if (dfValue > dfMax) + { + dfMax = dfValue; + nMaxX = iXBlock * nBlockXSize + iX; + nMaxY = iYBlock * nBlockYSize + iY; + } + } + } + } + else + { + 
size_t pos_min = 0; + size_t pos_max = 0; + const auto eEffectiveDT = bSignedByte ? GDT_Int8 : eDataType; + if (bNeedsMin && bNeedsMax) + { + std::tie(pos_min, pos_max) = gdal::minmax_element( + pData, static_cast(nBlockXSize) * nBlockYSize, + eEffectiveDT, bGotNoDataValue, dfNoDataValue); + } + else if (bNeedsMin) + { + pos_min = gdal::min_element( + pData, static_cast(nBlockXSize) * nBlockYSize, + eEffectiveDT, bGotNoDataValue, dfNoDataValue); + } + else if (bNeedsMax) + { + pos_max = gdal::max_element( + pData, static_cast(nBlockXSize) * nBlockYSize, + eEffectiveDT, bGotNoDataValue, dfNoDataValue); + } + + if (bNeedsMin) + { + const int nMinXBlock = static_cast(pos_min % nBlockXSize); + const int nMinYBlock = static_cast(pos_min / nBlockXSize); + bool bValid = true; + const double dfMinValueBlock = GetPixelValue( + eDataType, bSignedByte, pData, pos_min, bGotNoDataValue, + dfNoDataValue, bGotFloatNoDataValue, fNoDataValue, bValid); + if (bValid && dfMinValueBlock < dfMin) + { + dfMin = dfMinValueBlock; + nMinX = iXBlock * nBlockXSize + nMinXBlock; + nMinY = iYBlock * nBlockYSize + nMinYBlock; + } + } + + if (bNeedsMax) + { + const int nMaxXBlock = static_cast(pos_max % nBlockXSize); + const int nMaxYBlock = static_cast(pos_max / nBlockXSize); + bool bValid = true; + const double dfMaxValueBlock = GetPixelValue( + eDataType, bSignedByte, pData, pos_max, bGotNoDataValue, + dfNoDataValue, bGotFloatNoDataValue, fNoDataValue, bValid); + if (bValid && dfMaxValueBlock > dfMax) + { + dfMax = dfMaxValueBlock; + nMaxX = iXBlock * nBlockXSize + nMaxXBlock; + nMaxY = iYBlock * nBlockYSize + nMaxYBlock; + } + } + } + + poBlock->DropLock(); + + if (eDataType == GDT_Byte) + { + if (bNeedsMin && dfMin == 0) + { + bNeedsMin = false; + } + if (bNeedsMax && dfMax == 255) + { + bNeedsMax = false; + } + if (!bNeedsMin && !bNeedsMax) + { + break; + } + } + } + + CPLFree(pabyMaskData); + + if (pdfMin) + *pdfMin = dfMin; + if (pdfMax) + *pdfMax = dfMax; + if (pnMinX) + *pnMinX = nMinX; + if (pnMinY) + *pnMinY = nMinY; + if (pnMaxX) + *pnMaxX = nMaxX; + if (pnMaxY) + *pnMaxY = nMaxY; + return ((bNeedsMin && nMinX < 0) || (bNeedsMax && nMaxX < 0)) ? CE_Warning + : CE_None; +} + +/************************************************************************/ +/* GDALComputeRasterMinMaxLocation() */ +/************************************************************************/ + +/** + * \brief Compute the min/max values for a band, and their location. 
+ * + * @see GDALRasterBand::ComputeRasterMinMax() + * @since GDAL 3.11 + */ + +CPLErr GDALComputeRasterMinMaxLocation(GDALRasterBandH hBand, double *pdfMin, + double *pdfMax, int *pnMinX, int *pnMinY, + int *pnMaxX, int *pnMaxY) + +{ + VALIDATE_POINTER1(hBand, "GDALComputeRasterMinMaxLocation", CE_Failure); + + GDALRasterBand *poBand = GDALRasterBand::FromHandle(hBand); + return poBand->ComputeRasterMinMaxLocation(pdfMin, pdfMax, pnMinX, pnMinY, + pnMaxX, pnMaxY); +} + /************************************************************************/ /* SetDefaultHistogram() */ /************************************************************************/ diff --git a/swig/include/Band.i b/swig/include/Band.i index ea4fd47f6e22..6c45f714e078 100644 --- a/swig/include/Band.i +++ b/swig/include/Band.i @@ -669,6 +669,23 @@ CPLErr AdviseRead( int xoff, int yoff, int xsize, int ysize, %clear (CPLErr); #endif +%apply (double *OUTPUT){double *pdfMin, double *pdfMax}; +%apply (int *OUTPUT){int *pnMinX, int *pnMinY}; +%apply (int *OUTPUT){int *pnMaxX, int *pnMaxY}; +#if !defined(SWIGPYTHON) +%apply (IF_ERROR_RETURN_NONE) { (CPLErr) }; +#endif + CPLErr ComputeMinMaxLocation( double *pdfMin, double *pdfMax, + int *pnMinX, int *pnMinY, + int *pnMaxX, int *pnMaxY ) { + return GDALComputeRasterMinMaxLocation( self, pdfMin, pdfMax, + pnMinX, pnMinY, + pnMaxX, pnMaxY ); + } +#if !defined(SWIGPYTHON) +%clear (CPLErr); +#endif + %newobject AsMDArray; GDALMDArrayHS *AsMDArray() { diff --git a/swig/include/python/gdal_python.i b/swig/include/python/gdal_python.i index fac167ee295b..8282e1d425a1 100644 --- a/swig/include/python/gdal_python.i +++ b/swig/include/python/gdal_python.i @@ -4997,6 +4997,50 @@ def InterpolateAtPoint(self, *args, **kwargs): return ret[1] %} +%feature("shadow") ComputeMinMaxLocation %{ +def ComputeMinMaxLocation(self, *args, **kwargs): + """Compute the min/max values for a band, and their location. + + Pixels whose value matches the nodata value or are masked by the mask + band are ignored. + + If the minimum or maximum value is hit in several locations, it is not + specified which one will be returned. + + This is a mapping of :cpp:func:`GDALRasterBand::ComputeRasterMinMaxLocation`. + + Parameters + ---------- + None + + Returns + ------- + a named tuple (min, max, minX, minY, maxX, maxY) or or ``None`` + in case of error or no valid pixel. 
+ """ + + ret = $action(self, *args, **kwargs) + if ret[0] != CE_None: + return None + + import collections + tuple = collections.namedtuple('ComputeMinMaxLocationResult', + ['min', + 'max', + 'minX', + 'minY', + 'maxX', + 'maxY', + ]) + tuple.min = ret[1] + tuple.max = ret[2] + tuple.minX = ret[3] + tuple.minY = ret[4] + tuple.maxX = ret[5] + tuple.maxY = ret[6] + return tuple +%} + %pythoncode %{ # VSIFile: Copyright (c) 2024, Dan Baston diff --git a/swig/python/gdal-utils/osgeo_utils/samples/gdal_minmax_location.py b/swig/python/gdal-utils/osgeo_utils/samples/gdal_minmax_location.py new file mode 100644 index 000000000000..018ec00f0d29 --- /dev/null +++ b/swig/python/gdal-utils/osgeo_utils/samples/gdal_minmax_location.py @@ -0,0 +1,128 @@ +# !/usr/bin/env python3 +############################################################################### +# Project: GDAL utils +# Purpose: Get min/max location +# Author: Even Rouault +# +############################################################################### +# Copyright (c) 2024, Even Rouault +# +# SPDX-License-Identifier: MIT +############################################################################### + +import sys +import textwrap +from typing import Optional + +from osgeo import gdal, osr +from osgeo_utils.auxiliary.gdal_argparse import GDALArgumentParser, GDALScript +from osgeo_utils.auxiliary.util import PathOrDS, open_ds + + +def gdalminmaxlocation_util( + filename_or_ds: PathOrDS, + band_num: int, + open_options: Optional[dict] = None, + **kwargs, +): + ds = open_ds(filename_or_ds, open_options=open_options) + band = ds.GetRasterBand(band_num) + ret = band.ComputeMinMaxLocation() + if ret is None: + print("No valid pixels") + return 1 + gt = ds.GetGeoTransform(can_return_null=True) + if gt: + srs = ds.GetSpatialRef() + if srs: + wgs84 = osr.SpatialReference() + wgs84.SetFromUserInput("WGS84") + wgs84.SetAxisMappingStrategy(osr.OAMS_TRADITIONAL_GIS_ORDER) + ct = osr.CreateCoordinateTransformation(srs, wgs84) + georefX, georefY = gdal.ApplyGeoTransform( + gt, ret.minX + 0.5, ret.minY + 0.5 + ) + long, lat, _ = ct.TransformPoint(georefX, georefY) + print( + f"Minimum={ret.min} at (col,line)=({ret.minX},{ret.minY}), (X,Y)_georef=({georefX},{georefY}), (long,lat)_WGS84=({long:.7f},{lat:.7f})" + ) + georefX, georefY = gdal.ApplyGeoTransform( + gt, ret.maxX + 0.5, ret.maxY + 0.5 + ) + long, lat, _ = ct.TransformPoint(georefX, georefY) + print( + f"Maximum={ret.max} at (col,line)=({ret.maxX},{ret.maxY}), (X,Y)_georef=({georefX},{georefY}), (long,lat)_WGS84=({long:.7f},{lat:.7f})" + ) + else: + georefX, georefY = gdal.ApplyGeoTransform( + gt, ret.minX + 0.5, ret.minY + 0.5 + ) + print( + f"Minimum={ret.min} at (col,line)=({ret.minX},{ret.minY}), (X,Y)_georef=({georefX},{georefY})" + ) + georefX, georefY = gdal.ApplyGeoTransform( + gt, ret.maxX + 0.5, ret.maxY + 0.5 + ) + print( + f"Maximum={ret.max} at (col,line)=({ret.maxX},{ret.maxY}), (X,Y)_georef=({georefX},{georefY})" + ) + else: + print(f"Minimum={ret.min} at (col,line)=({ret.minX},{ret.minY})") + print(f"Maximum={ret.max} at (col,line)=({ret.maxX},{ret.maxY})") + + return 0 + + +class GDALMinMaxLocation(GDALScript): + def __init__(self): + super().__init__() + self.title = "Raster min/max location query tool" + self.description = textwrap.dedent( + """\ + The gdal_minmax_location utility returns the location where min/max values of a raster are hit.""" + ) + self.interactive_mode = None + + def get_parser(self, argv) -> GDALArgumentParser: + parser = self.parser + + 
parser.add_argument( + "-b", + dest="band_num", + metavar="band", + type=int, + default=1, + help="Selects a band to query (default: first one).", + ) + + parser.add_argument( + "-oo", + dest="open_options", + metavar="NAME=VALUE", + help="Dataset open option (format specific).", + nargs="+", + ) + + parser.add_argument( + "filename_or_ds", + metavar="filename", + type=str, + help="The source GDAL raster datasource name.", + ) + + return parser + + def augment_kwargs(self, kwargs) -> dict: + return kwargs + + def doit(self, **kwargs): + return gdalminmaxlocation_util(**kwargs) + + +def main(argv=sys.argv): + gdal.UseExceptions() + return GDALMinMaxLocation().main(argv) + + +if __name__ == "__main__": + sys.exit(main(sys.argv)) From 4ff774993eb853683c295c916bd551b780ccc474 Mon Sep 17 00:00:00 2001 From: Even Rouault Date: Sun, 17 Mar 2024 00:23:47 +0100 Subject: [PATCH 19/62] Add gcore/sse2neon.h --- .pre-commit-config.yaml | 1 + Doxyfile | 4 +- gcore/sse2neon.h | 9399 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 9403 insertions(+), 1 deletion(-) create mode 100644 gcore/sse2neon.h diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index f45a709ff7d0..058d13cf2877 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -56,6 +56,7 @@ repos: frmts/pcidsk/sdk| frmts/grib/degrib/degrib| frmts/grib/degrib/g2clib| + gcore/sse2neon.h| port/utf8.h| ogr/ogrsf_frmts/adbc/ogr_adbc_internal.h| ogr/ogrsf_frmts/cad/libopencad/| diff --git a/Doxyfile b/Doxyfile index b22e38ec20c7..4a896612e813 100644 --- a/Doxyfile +++ b/Doxyfile @@ -415,7 +415,9 @@ RECURSIVE = NO # subdirectory from a directory tree whose root is specified with the INPUT tag. EXCLUDE = gcore/rawdataset.cpp \ - gcore/rawdataset.h + gcore/rawdataset.h \ + gcore/include_sse2neon.h \ + gcore/sse2neon.h # The EXCLUDE_SYMLINKS tag can be used select whether or not files or # directories that are symbolic links (a Unix filesystem feature) are excluded diff --git a/gcore/sse2neon.h b/gcore/sse2neon.h new file mode 100644 index 000000000000..10a1196ebd39 --- /dev/null +++ b/gcore/sse2neon.h @@ -0,0 +1,9399 @@ +#ifndef SSE2NEON_H +#define SSE2NEON_H + +/* + * sse2neon is freely redistributable under the MIT License. + * + * Copyright (c) 2015-2024 SSE2NEON Contributors. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +// This header file provides a simple API translation layer +// between SSE intrinsics to their corresponding Arm/Aarch64 NEON versions +// +// Contributors to this work are: +// John W. Ratcliff +// Brandon Rowlett +// Ken Fast +// Eric van Beurden +// Alexander Potylitsin +// Hasindu Gamaarachchi +// Jim Huang +// Mark Cheng +// Malcolm James MacLeod +// Devin Hussey (easyaspi314) +// Sebastian Pop +// Developer Ecosystem Engineering +// Danila Kutenin +// François Turban (JishinMaster) +// Pei-Hsuan Hung +// Yang-Hao Yuan +// Syoyo Fujita +// Brecht Van Lommel +// Jonathan Hue +// Cuda Chen +// Aymen Qader +// Anthony Roberts + +/* Tunable configurations */ + +/* Enable precise implementation of math operations + * This would slow down the computation a bit, but gives consistent result with + * x86 SSE. (e.g. would solve a hole or NaN pixel in the rendering result) + */ +/* _mm_min|max_ps|ss|pd|sd */ +#ifndef SSE2NEON_PRECISE_MINMAX +#define SSE2NEON_PRECISE_MINMAX (0) +#endif +/* _mm_rcp_ps */ +#ifndef SSE2NEON_PRECISE_DIV +#define SSE2NEON_PRECISE_DIV (0) +#endif +/* _mm_sqrt_ps and _mm_rsqrt_ps */ +#ifndef SSE2NEON_PRECISE_SQRT +#define SSE2NEON_PRECISE_SQRT (0) +#endif +/* _mm_dp_pd */ +#ifndef SSE2NEON_PRECISE_DP +#define SSE2NEON_PRECISE_DP (0) +#endif + +/* Enable inclusion of windows.h on MSVC platforms + * This makes _mm_clflush functional on windows, as there is no builtin. + */ +#ifndef SSE2NEON_INCLUDE_WINDOWS_H +#define SSE2NEON_INCLUDE_WINDOWS_H (0) +#endif + +/* compiler specific definitions */ +#if defined(__GNUC__) || defined(__clang__) +#pragma push_macro("FORCE_INLINE") +#pragma push_macro("ALIGN_STRUCT") +#define FORCE_INLINE static inline __attribute__((always_inline)) +#define ALIGN_STRUCT(x) __attribute__((aligned(x))) +#define _sse2neon_likely(x) __builtin_expect(!!(x), 1) +#define _sse2neon_unlikely(x) __builtin_expect(!!(x), 0) +#elif defined(_MSC_VER) +#if _MSVC_TRADITIONAL +#error Using the traditional MSVC preprocessor is not supported! Use /Zc:preprocessor instead. +#endif +#ifndef FORCE_INLINE +#define FORCE_INLINE static inline +#endif +#ifndef ALIGN_STRUCT +#define ALIGN_STRUCT(x) __declspec(align(x)) +#endif +#define _sse2neon_likely(x) (x) +#define _sse2neon_unlikely(x) (x) +#else +#pragma message("Macro name collisions may happen with unsupported compilers.") +#endif + +#if !defined(__clang__) && defined(__GNUC__) && __GNUC__ < 10 +#warning "GCC versions earlier than 10 are not supported." +#endif + +#ifdef __OPTIMIZE__ +#warning \ + "Report any potential compiler optimization issues when using SSE2NEON. See the 'Optimization' section at https://github.com/DLTcollab/sse2neon." +#endif + +/* C language does not allow initializing a variable with a function call. */ +#ifdef __cplusplus +#define _sse2neon_const static const +#else +#define _sse2neon_const const +#endif + +#include +#include +#include +#include + +FORCE_INLINE double sse2neon_recast_u64_f64(uint64_t u64) +{ + double f64; + memcpy(&f64, &u64, sizeof(uint64_t)); + return f64; +} +FORCE_INLINE int64_t sse2neon_recast_f64_s64(double f64) +{ + int64_t i64; + memcpy(&i64, &f64, sizeof(uint64_t)); + return i64; +} + +#if defined(_WIN32) +/* Definitions for _mm_{malloc,free} are provided by + * from both MinGW-w64 and MSVC. 
+ */ +#define SSE2NEON_ALLOC_DEFINED +#endif + +/* If using MSVC */ +#ifdef _MSC_VER +#include +#if SSE2NEON_INCLUDE_WINDOWS_H +#include +#include +#endif + +#if !defined(__cplusplus) +#error SSE2NEON only supports C++ compilation with this compiler +#endif + +#ifdef SSE2NEON_ALLOC_DEFINED +#include +#endif + +#if (defined(_M_AMD64) || defined(__x86_64__)) || \ + (defined(_M_ARM64) || defined(__arm64__)) +#define SSE2NEON_HAS_BITSCAN64 +#endif +#endif + +#if defined(__GNUC__) || defined(__clang__) +#define _sse2neon_define0(type, s, body) \ + __extension__({ \ + type _a = (s); \ + body \ + }) +#define _sse2neon_define1(type, s, body) \ + __extension__({ \ + type _a = (s); \ + body \ + }) +#define _sse2neon_define2(type, a, b, body) \ + __extension__({ \ + type _a = (a), _b = (b); \ + body \ + }) +#define _sse2neon_return(ret) (ret) +#else +#define _sse2neon_define0(type, a, body) [=](type _a) { body }(a) +#define _sse2neon_define1(type, a, body) [](type _a) { body }(a) +#define _sse2neon_define2(type, a, b, body) \ + [](type _a, type _b) { body }((a), (b)) +#define _sse2neon_return(ret) return ret +#endif + +#define _sse2neon_init(...) \ + { \ + __VA_ARGS__ \ + } + +/* Compiler barrier */ +#if defined(_MSC_VER) && !defined(__clang__) +#define SSE2NEON_BARRIER() _ReadWriteBarrier() +#else +#define SSE2NEON_BARRIER() \ + do { \ + __asm__ __volatile__("" ::: "memory"); \ + (void) 0; \ + } while (0) +#endif + +/* Memory barriers + * __atomic_thread_fence does not include a compiler barrier; instead, + * the barrier is part of __atomic_load/__atomic_store's "volatile-like" + * semantics. + */ +#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) +#include +#endif + +FORCE_INLINE void _sse2neon_smp_mb(void) +{ + SSE2NEON_BARRIER(); +#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) && \ + !defined(__STDC_NO_ATOMICS__) + atomic_thread_fence(memory_order_seq_cst); +#elif defined(__GNUC__) || defined(__clang__) + __atomic_thread_fence(__ATOMIC_SEQ_CST); +#else /* MSVC */ + __dmb(_ARM64_BARRIER_ISH); +#endif +} + +/* Architecture-specific build options */ +/* FIXME: #pragma GCC push_options is only available on GCC */ +#if defined(__GNUC__) +#if defined(__arm__) && __ARM_ARCH == 7 +/* According to ARM C Language Extensions Architecture specification, + * __ARM_NEON is defined to a value indicating the Advanced SIMD (NEON) + * architecture supported. + */ +#if !defined(__ARM_NEON) || !defined(__ARM_NEON__) +#error "You must enable NEON instructions (e.g. -mfpu=neon) to use SSE2NEON." +#endif +#if !defined(__clang__) +#pragma GCC push_options +#pragma GCC target("fpu=neon") +#endif +#elif defined(__aarch64__) || defined(_M_ARM64) +#if !defined(__clang__) && !defined(_MSC_VER) +#pragma GCC push_options +#pragma GCC target("+simd") +#endif +#elif __ARM_ARCH == 8 +#if !defined(__ARM_NEON) || !defined(__ARM_NEON__) +#error \ + "You must enable NEON instructions (e.g. -mfpu=neon-fp-armv8) to use SSE2NEON." +#endif +#if !defined(__clang__) && !defined(_MSC_VER) +#pragma GCC push_options +#endif +#else +#error \ + "Unsupported target. Must be either ARMv7-A+NEON or ARMv8-A \ +(you could try setting target explicitly with -march or -mcpu)" +#endif +#endif + +#include +#if (!defined(__aarch64__) && !defined(_M_ARM64)) && (__ARM_ARCH == 8) +#if defined __has_include && __has_include() +#include +#endif +#endif + +/* Apple Silicon cache lines are double of what is commonly used by Intel, AMD + * and other Arm microarchitectures use. 
+ * From sysctl -a on Apple M1:
+ * hw.cachelinesize: 128
+ */
+#if defined(__APPLE__) && (defined(__aarch64__) || defined(__arm64__))
+#define SSE2NEON_CACHELINE_SIZE 128
+#else
+#define SSE2NEON_CACHELINE_SIZE 64
+#endif
+
+/* Rounding functions require either Aarch64 instructions or libm fallback */
+#if !defined(__aarch64__) && !defined(_M_ARM64)
+#include <math.h>
+#endif
+
+/* On ARMv7, some registers, such as PMUSERENR and PMCCNTR, are read-only
+ * or even not accessible in user mode.
+ * To write or access to these registers in user mode,
+ * we have to perform syscall instead.
+ */
+#if (!defined(__aarch64__) && !defined(_M_ARM64))
+#include <sys/time.h>
+#endif
+
+/* "__has_builtin" can be used to query support for built-in functions
+ * provided by gcc/clang and other compilers that support it.
+ */
+#ifndef __has_builtin /* GCC prior to 10 or non-clang compilers */
+/* Compatibility with gcc <= 9 */
+#if defined(__GNUC__) && (__GNUC__ <= 9)
+#define __has_builtin(x) HAS##x
+#define HAS__builtin_popcount 1
+#define HAS__builtin_popcountll 1
+
+// __builtin_shuffle introduced in GCC 4.7.0
+#if (__GNUC__ >= 5) || ((__GNUC__ == 4) && (__GNUC_MINOR__ >= 7))
+#define HAS__builtin_shuffle 1
+#else
+#define HAS__builtin_shuffle 0
+#endif
+
+#define HAS__builtin_shufflevector 0
+#define HAS__builtin_nontemporal_store 0
+#else
+#define __has_builtin(x) 0
+#endif
+#endif
+
+/**
+ * MACRO for shuffle parameter for _mm_shuffle_ps().
+ * Argument fp3 is a digit[0123] that represents the fp from argument "b"
+ * of _mm_shuffle_ps that will be placed in fp3 of result. fp2 is the same
+ * for fp2 in result. fp1 is a digit[0123] that represents the fp from
+ * argument "a" of _mm_shuffle_ps that will be placed in fp1 of result.
+ * fp0 is the same for fp0 of result.
+ */
+#define _MM_SHUFFLE(fp3, fp2, fp1, fp0) \
+    (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0)))
+
+#if __has_builtin(__builtin_shufflevector)
+#define _sse2neon_shuffle(type, a, b, ...) \
+    __builtin_shufflevector(a, b, __VA_ARGS__)
+#elif __has_builtin(__builtin_shuffle)
+#define _sse2neon_shuffle(type, a, b, ...) \
+    __extension__({ \
+        type tmp = {__VA_ARGS__}; \
+        __builtin_shuffle(a, b, tmp); \
+    })
+#endif
+
+#ifdef _sse2neon_shuffle
+#define vshuffle_s16(a, b, ...) _sse2neon_shuffle(int16x4_t, a, b, __VA_ARGS__)
+#define vshuffleq_s16(a, b, ...) _sse2neon_shuffle(int16x8_t, a, b, __VA_ARGS__)
+#define vshuffle_s32(a, b, ...) _sse2neon_shuffle(int32x2_t, a, b, __VA_ARGS__)
+#define vshuffleq_s32(a, b, ...) _sse2neon_shuffle(int32x4_t, a, b, __VA_ARGS__)
+#define vshuffle_s64(a, b, ...) _sse2neon_shuffle(int64x1_t, a, b, __VA_ARGS__)
+#define vshuffleq_s64(a, b, ...) _sse2neon_shuffle(int64x2_t, a, b, __VA_ARGS__)
+#endif
+
+/* Rounding mode macros.
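A short worked example of the _MM_SHUFFLE selector encoding (v is a placeholder __m128):

    /* Each argument is a 2-bit source-lane index, packed high to low:  */
    /*   _MM_SHUFFLE(3, 2, 1, 0) == 0xE4  - identity permutation        */
    /*   _MM_SHUFFLE(0, 1, 2, 3) == 0x1B  - reverses the four lanes     */
    __m128 reversed = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 1, 2, 3));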
*/ +#define _MM_FROUND_TO_NEAREST_INT 0x00 +#define _MM_FROUND_TO_NEG_INF 0x01 +#define _MM_FROUND_TO_POS_INF 0x02 +#define _MM_FROUND_TO_ZERO 0x03 +#define _MM_FROUND_CUR_DIRECTION 0x04 +#define _MM_FROUND_NO_EXC 0x08 +#define _MM_FROUND_RAISE_EXC 0x00 +#define _MM_FROUND_NINT (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_RAISE_EXC) +#define _MM_FROUND_FLOOR (_MM_FROUND_TO_NEG_INF | _MM_FROUND_RAISE_EXC) +#define _MM_FROUND_CEIL (_MM_FROUND_TO_POS_INF | _MM_FROUND_RAISE_EXC) +#define _MM_FROUND_TRUNC (_MM_FROUND_TO_ZERO | _MM_FROUND_RAISE_EXC) +#define _MM_FROUND_RINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_RAISE_EXC) +#define _MM_FROUND_NEARBYINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC) +#define _MM_ROUND_NEAREST 0x0000 +#define _MM_ROUND_DOWN 0x2000 +#define _MM_ROUND_UP 0x4000 +#define _MM_ROUND_TOWARD_ZERO 0x6000 +/* Flush zero mode macros. */ +#define _MM_FLUSH_ZERO_MASK 0x8000 +#define _MM_FLUSH_ZERO_ON 0x8000 +#define _MM_FLUSH_ZERO_OFF 0x0000 +/* Denormals are zeros mode macros. */ +#define _MM_DENORMALS_ZERO_MASK 0x0040 +#define _MM_DENORMALS_ZERO_ON 0x0040 +#define _MM_DENORMALS_ZERO_OFF 0x0000 + +/* indicate immediate constant argument in a given range */ +#define __constrange(a, b) const + +/* A few intrinsics accept traditional data types like ints or floats, but + * most operate on data types that are specific to SSE. + * If a vector type ends in d, it contains doubles, and if it does not have + * a suffix, it contains floats. An integer vector type can contain any type + * of integer, from chars to shorts to unsigned long longs. + */ +typedef int64x1_t __m64; +typedef float32x4_t __m128; /* 128-bit vector containing 4 floats */ +// On ARM 32-bit architecture, the float64x2_t is not supported. +// The data type __m128d should be represented in a different way for related +// intrinsic conversion. +#if defined(__aarch64__) || defined(_M_ARM64) +typedef float64x2_t __m128d; /* 128-bit vector containing 2 doubles */ +#else +typedef float32x4_t __m128d; +#endif +typedef int64x2_t __m128i; /* 128-bit vector containing integers */ + +// Some intrinsics operate on unaligned data types. 
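The rounding immediates above combine a direction with an exception-control bit; an illustrative pairing (x is a placeholder __m128, _mm_round_ps is provided further down in the header):

    __m128 f = _mm_round_ps(x, _MM_FROUND_FLOOR);     /* TO_NEG_INF | RAISE_EXC */
    __m128 n = _mm_round_ps(x, _MM_FROUND_NEARBYINT); /* CUR_DIRECTION | NO_EXC */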
+typedef int16_t ALIGN_STRUCT(1) unaligned_int16_t; +typedef int32_t ALIGN_STRUCT(1) unaligned_int32_t; +typedef int64_t ALIGN_STRUCT(1) unaligned_int64_t; + +// __int64 is defined in the Intrinsics Guide which maps to different datatype +// in different data model +#if !(defined(_WIN32) || defined(_WIN64) || defined(__int64)) +#if (defined(__x86_64__) || defined(__i386__)) +#define __int64 long long +#else +#define __int64 int64_t +#endif +#endif + +/* type-safe casting between types */ + +#define vreinterpretq_m128_f16(x) vreinterpretq_f32_f16(x) +#define vreinterpretq_m128_f32(x) (x) +#define vreinterpretq_m128_f64(x) vreinterpretq_f32_f64(x) + +#define vreinterpretq_m128_u8(x) vreinterpretq_f32_u8(x) +#define vreinterpretq_m128_u16(x) vreinterpretq_f32_u16(x) +#define vreinterpretq_m128_u32(x) vreinterpretq_f32_u32(x) +#define vreinterpretq_m128_u64(x) vreinterpretq_f32_u64(x) + +#define vreinterpretq_m128_s8(x) vreinterpretq_f32_s8(x) +#define vreinterpretq_m128_s16(x) vreinterpretq_f32_s16(x) +#define vreinterpretq_m128_s32(x) vreinterpretq_f32_s32(x) +#define vreinterpretq_m128_s64(x) vreinterpretq_f32_s64(x) + +#define vreinterpretq_f16_m128(x) vreinterpretq_f16_f32(x) +#define vreinterpretq_f32_m128(x) (x) +#define vreinterpretq_f64_m128(x) vreinterpretq_f64_f32(x) + +#define vreinterpretq_u8_m128(x) vreinterpretq_u8_f32(x) +#define vreinterpretq_u16_m128(x) vreinterpretq_u16_f32(x) +#define vreinterpretq_u32_m128(x) vreinterpretq_u32_f32(x) +#define vreinterpretq_u64_m128(x) vreinterpretq_u64_f32(x) + +#define vreinterpretq_s8_m128(x) vreinterpretq_s8_f32(x) +#define vreinterpretq_s16_m128(x) vreinterpretq_s16_f32(x) +#define vreinterpretq_s32_m128(x) vreinterpretq_s32_f32(x) +#define vreinterpretq_s64_m128(x) vreinterpretq_s64_f32(x) + +#define vreinterpretq_m128i_s8(x) vreinterpretq_s64_s8(x) +#define vreinterpretq_m128i_s16(x) vreinterpretq_s64_s16(x) +#define vreinterpretq_m128i_s32(x) vreinterpretq_s64_s32(x) +#define vreinterpretq_m128i_s64(x) (x) + +#define vreinterpretq_m128i_u8(x) vreinterpretq_s64_u8(x) +#define vreinterpretq_m128i_u16(x) vreinterpretq_s64_u16(x) +#define vreinterpretq_m128i_u32(x) vreinterpretq_s64_u32(x) +#define vreinterpretq_m128i_u64(x) vreinterpretq_s64_u64(x) + +#define vreinterpretq_f32_m128i(x) vreinterpretq_f32_s64(x) +#define vreinterpretq_f64_m128i(x) vreinterpretq_f64_s64(x) + +#define vreinterpretq_s8_m128i(x) vreinterpretq_s8_s64(x) +#define vreinterpretq_s16_m128i(x) vreinterpretq_s16_s64(x) +#define vreinterpretq_s32_m128i(x) vreinterpretq_s32_s64(x) +#define vreinterpretq_s64_m128i(x) (x) + +#define vreinterpretq_u8_m128i(x) vreinterpretq_u8_s64(x) +#define vreinterpretq_u16_m128i(x) vreinterpretq_u16_s64(x) +#define vreinterpretq_u32_m128i(x) vreinterpretq_u32_s64(x) +#define vreinterpretq_u64_m128i(x) vreinterpretq_u64_s64(x) + +#define vreinterpret_m64_s8(x) vreinterpret_s64_s8(x) +#define vreinterpret_m64_s16(x) vreinterpret_s64_s16(x) +#define vreinterpret_m64_s32(x) vreinterpret_s64_s32(x) +#define vreinterpret_m64_s64(x) (x) + +#define vreinterpret_m64_u8(x) vreinterpret_s64_u8(x) +#define vreinterpret_m64_u16(x) vreinterpret_s64_u16(x) +#define vreinterpret_m64_u32(x) vreinterpret_s64_u32(x) +#define vreinterpret_m64_u64(x) vreinterpret_s64_u64(x) + +#define vreinterpret_m64_f16(x) vreinterpret_s64_f16(x) +#define vreinterpret_m64_f32(x) vreinterpret_s64_f32(x) +#define vreinterpret_m64_f64(x) vreinterpret_s64_f64(x) + +#define vreinterpret_u8_m64(x) vreinterpret_u8_s64(x) +#define vreinterpret_u16_m64(x) 
vreinterpret_u16_s64(x) +#define vreinterpret_u32_m64(x) vreinterpret_u32_s64(x) +#define vreinterpret_u64_m64(x) vreinterpret_u64_s64(x) + +#define vreinterpret_s8_m64(x) vreinterpret_s8_s64(x) +#define vreinterpret_s16_m64(x) vreinterpret_s16_s64(x) +#define vreinterpret_s32_m64(x) vreinterpret_s32_s64(x) +#define vreinterpret_s64_m64(x) (x) + +#define vreinterpret_f32_m64(x) vreinterpret_f32_s64(x) + +#if defined(__aarch64__) || defined(_M_ARM64) +#define vreinterpretq_m128d_s32(x) vreinterpretq_f64_s32(x) +#define vreinterpretq_m128d_s64(x) vreinterpretq_f64_s64(x) + +#define vreinterpretq_m128d_u64(x) vreinterpretq_f64_u64(x) + +#define vreinterpretq_m128d_f32(x) vreinterpretq_f64_f32(x) +#define vreinterpretq_m128d_f64(x) (x) + +#define vreinterpretq_s64_m128d(x) vreinterpretq_s64_f64(x) + +#define vreinterpretq_u32_m128d(x) vreinterpretq_u32_f64(x) +#define vreinterpretq_u64_m128d(x) vreinterpretq_u64_f64(x) + +#define vreinterpretq_f64_m128d(x) (x) +#define vreinterpretq_f32_m128d(x) vreinterpretq_f32_f64(x) +#else +#define vreinterpretq_m128d_s32(x) vreinterpretq_f32_s32(x) +#define vreinterpretq_m128d_s64(x) vreinterpretq_f32_s64(x) + +#define vreinterpretq_m128d_u32(x) vreinterpretq_f32_u32(x) +#define vreinterpretq_m128d_u64(x) vreinterpretq_f32_u64(x) + +#define vreinterpretq_m128d_f32(x) (x) + +#define vreinterpretq_s64_m128d(x) vreinterpretq_s64_f32(x) + +#define vreinterpretq_u32_m128d(x) vreinterpretq_u32_f32(x) +#define vreinterpretq_u64_m128d(x) vreinterpretq_u64_f32(x) + +#define vreinterpretq_f32_m128d(x) (x) +#endif + +// A struct is defined in this header file called 'SIMDVec' which can be used +// by applications which attempt to access the contents of an __m128 struct +// directly. It is important to note that accessing the __m128 struct directly +// is bad coding practice by Microsoft: @see: +// https://learn.microsoft.com/en-us/cpp/cpp/m128 +// +// However, some legacy source code may try to access the contents of an __m128 +// struct directly so the developer can use the SIMDVec as an alias for it. Any +// casting must be done manually by the developer, as you cannot cast or +// otherwise alias the base NEON data type for intrinsic operations. +// +// union intended to allow direct access to an __m128 variable using the names +// that the MSVC compiler provides. This union should really only be used when +// trying to access the members of the vector as integer values. GCC/clang +// allow native access to the float members through a simple array access +// operator (in C since 4.6, in C++ since 4.8). +// +// Ideally direct accesses to SIMD vectors should not be used since it can cause +// a performance hit. If it really is needed however, the original __m128 +// variable can be aliased with a pointer to this union and used to access +// individual components. The use of this union should be hidden behind a macro +// that is used throughout the codebase to access the members instead of always +// declaring this type of variable. +typedef union ALIGN_STRUCT(16) SIMDVec { + float m128_f32[4]; // as floats - DON'T USE. Added for convenience. + int8_t m128_i8[16]; // as signed 8-bit integers. + int16_t m128_i16[8]; // as signed 16-bit integers. + int32_t m128_i32[4]; // as signed 32-bit integers. + int64_t m128_i64[2]; // as signed 64-bit integers. + uint8_t m128_u8[16]; // as unsigned 8-bit integers. + uint16_t m128_u16[8]; // as unsigned 16-bit integers. + uint32_t m128_u32[4]; // as unsigned 32-bit integers. + uint64_t m128_u64[2]; // as unsigned 64-bit integers. 
+} SIMDVec; + +// casting using SIMDVec +#define vreinterpretq_nth_u64_m128i(x, n) (((SIMDVec *) &x)->m128_u64[n]) +#define vreinterpretq_nth_u32_m128i(x, n) (((SIMDVec *) &x)->m128_u32[n]) +#define vreinterpretq_nth_u8_m128i(x, n) (((SIMDVec *) &x)->m128_u8[n]) + +/* SSE macros */ +#define _MM_GET_FLUSH_ZERO_MODE _sse2neon_mm_get_flush_zero_mode +#define _MM_SET_FLUSH_ZERO_MODE _sse2neon_mm_set_flush_zero_mode +#define _MM_GET_DENORMALS_ZERO_MODE _sse2neon_mm_get_denormals_zero_mode +#define _MM_SET_DENORMALS_ZERO_MODE _sse2neon_mm_set_denormals_zero_mode + +// Function declaration +// SSE +FORCE_INLINE unsigned int _MM_GET_ROUNDING_MODE(void); +FORCE_INLINE __m128 _mm_move_ss(__m128, __m128); +FORCE_INLINE __m128 _mm_or_ps(__m128, __m128); +FORCE_INLINE __m128 _mm_set_ps1(float); +FORCE_INLINE __m128 _mm_setzero_ps(void); +// SSE2 +FORCE_INLINE __m128i _mm_and_si128(__m128i, __m128i); +FORCE_INLINE __m128i _mm_castps_si128(__m128); +FORCE_INLINE __m128i _mm_cmpeq_epi32(__m128i, __m128i); +FORCE_INLINE __m128i _mm_cvtps_epi32(__m128); +FORCE_INLINE __m128d _mm_move_sd(__m128d, __m128d); +FORCE_INLINE __m128i _mm_or_si128(__m128i, __m128i); +FORCE_INLINE __m128i _mm_set_epi32(int, int, int, int); +FORCE_INLINE __m128i _mm_set_epi64x(int64_t, int64_t); +FORCE_INLINE __m128d _mm_set_pd(double, double); +FORCE_INLINE __m128i _mm_set1_epi32(int); +FORCE_INLINE __m128i _mm_setzero_si128(void); +// SSE4.1 +FORCE_INLINE __m128d _mm_ceil_pd(__m128d); +FORCE_INLINE __m128 _mm_ceil_ps(__m128); +FORCE_INLINE __m128d _mm_floor_pd(__m128d); +FORCE_INLINE __m128 _mm_floor_ps(__m128); +FORCE_INLINE __m128d _mm_round_pd(__m128d, int); +FORCE_INLINE __m128 _mm_round_ps(__m128, int); +// SSE4.2 +FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t, uint8_t); + +/* Backwards compatibility for compilers with lack of specific type support */ + +// Older gcc does not define vld1q_u8_x4 type +#if defined(__GNUC__) && !defined(__clang__) && \ + ((__GNUC__ <= 13 && defined(__arm__)) || \ + (__GNUC__ == 10 && __GNUC_MINOR__ < 3 && defined(__aarch64__)) || \ + (__GNUC__ <= 9 && defined(__aarch64__))) +FORCE_INLINE uint8x16x4_t _sse2neon_vld1q_u8_x4(const uint8_t *p) +{ + uint8x16x4_t ret; + ret.val[0] = vld1q_u8(p + 0); + ret.val[1] = vld1q_u8(p + 16); + ret.val[2] = vld1q_u8(p + 32); + ret.val[3] = vld1q_u8(p + 48); + return ret; +} +#else +// Wraps vld1q_u8_x4 +FORCE_INLINE uint8x16x4_t _sse2neon_vld1q_u8_x4(const uint8_t *p) +{ + return vld1q_u8_x4(p); +} +#endif + +#if !defined(__aarch64__) && !defined(_M_ARM64) +/* emulate vaddv u8 variant */ +FORCE_INLINE uint8_t _sse2neon_vaddv_u8(uint8x8_t v8) +{ + const uint64x1_t v1 = vpaddl_u32(vpaddl_u16(vpaddl_u8(v8))); + return vget_lane_u8(vreinterpret_u8_u64(v1), 0); +} +#else +// Wraps vaddv_u8 +FORCE_INLINE uint8_t _sse2neon_vaddv_u8(uint8x8_t v8) +{ + return vaddv_u8(v8); +} +#endif + +#if !defined(__aarch64__) && !defined(_M_ARM64) +/* emulate vaddvq u8 variant */ +FORCE_INLINE uint8_t _sse2neon_vaddvq_u8(uint8x16_t a) +{ + uint8x8_t tmp = vpadd_u8(vget_low_u8(a), vget_high_u8(a)); + uint8_t res = 0; + for (int i = 0; i < 8; ++i) + res += tmp[i]; + return res; +} +#else +// Wraps vaddvq_u8 +FORCE_INLINE uint8_t _sse2neon_vaddvq_u8(uint8x16_t a) +{ + return vaddvq_u8(a); +} +#endif + +#if !defined(__aarch64__) && !defined(_M_ARM64) +/* emulate vaddvq u16 variant */ +FORCE_INLINE uint16_t _sse2neon_vaddvq_u16(uint16x8_t a) +{ + uint32x4_t m = vpaddlq_u16(a); + uint64x2_t n = vpaddlq_u32(m); + uint64x1_t o = vget_low_u64(n) + vget_high_u64(n); + + return 
 vget_lane_u32((uint32x2_t) o, 0);
+}
+#else
+// Wraps vaddvq_u16
+FORCE_INLINE uint16_t _sse2neon_vaddvq_u16(uint16x8_t a)
+{
+    return vaddvq_u16(a);
+}
+#endif
+
+/* Function Naming Conventions
+ * The naming convention of SSE intrinsics is straightforward. A generic SSE
+ * intrinsic function is given as follows:
+ *   _mm_<name>_<data_type>
+ *
+ * The parts of this format are given as follows:
+ * 1. <name> describes the operation performed by the intrinsic
+ * 2. <data_type> identifies the data type of the function's primary arguments
+ *
+ * This last part, <data_type>, is a little complicated. It identifies the
+ * content of the input values, and can be set to any of the following values:
+ * + ps - vectors contain floats (ps stands for packed single-precision)
+ * + pd - vectors contain doubles (pd stands for packed double-precision)
+ * + epi8/epi16/epi32/epi64 - vectors contain 8-bit/16-bit/32-bit/64-bit
+ *   signed integers
+ * + epu8/epu16/epu32/epu64 - vectors contain 8-bit/16-bit/32-bit/64-bit
+ *   unsigned integers
+ * + si128 - unspecified 128-bit vector or 256-bit vector
+ * + m128/m128i/m128d - identifies input vector types when they are different
+ *   than the type of the returned vector
+ *
+ * For example, _mm_setzero_ps. The _mm implies that the function returns
+ * a 128-bit vector. The _ps at the end implies that the argument vectors
+ * contain floats.
+ *
+ * A complete example: Byte Shuffle - pshufb (_mm_shuffle_epi8)
+ *   // Set packed 16-bit integers. 128 bits, 8 short, per 16 bits
+ *   __m128i v_in = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
+ *   // Set packed 8-bit integers
+ *   // 128 bits, 16 chars, per 8 bits
+ *   __m128i v_perm = _mm_setr_epi8(1, 0, 2, 3, 8, 9, 10, 11,
+ *                                  4, 5, 12, 13, 6, 7, 14, 15);
+ *   // Shuffle packed 8-bit integers
+ *   __m128i v_out = _mm_shuffle_epi8(v_in, v_perm); // pshufb
+ */
+
+/* Constants for use with _mm_prefetch. */
+enum _mm_hint {
+    _MM_HINT_NTA = 0, /* load data to L1 and L2 cache, mark it as NTA */
+    _MM_HINT_T0 = 1,  /* load data to L1 and L2 cache */
+    _MM_HINT_T1 = 2,  /* load data to L2 cache only */
+    _MM_HINT_T2 = 3,  /* load data to L2 cache only, mark it as NTA */
+};
+
+// The bit field mapping to the FPCR(floating-point control register)
+typedef struct {
+    uint16_t res0;
+    uint8_t res1 : 6;
+    uint8_t bit22 : 1;
+    uint8_t bit23 : 1;
+    uint8_t bit24 : 1;
+    uint8_t res2 : 7;
+#if defined(__aarch64__) || defined(_M_ARM64)
+    uint32_t res3;
+#endif
+} fpcr_bitfield;
+
+// Takes the upper 64 bits of a and places it in the low end of the result
+// Takes the lower 64 bits of b and places it into the high end of the result.
+FORCE_INLINE __m128 _mm_shuffle_ps_1032(__m128 a, __m128 b)
+{
+    float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
+    float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
+    return vreinterpretq_m128_f32(vcombine_f32(a32, b10));
+}
+
+// takes the lower two 32-bit values from a and swaps them and places in high
+// end of result takes the higher two 32 bit values from b and swaps them and
+// places in low end of result.
+FORCE_INLINE __m128 _mm_shuffle_ps_2301(__m128 a, __m128 b) +{ + float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a))); + float32x2_t b23 = vrev64_f32(vget_high_f32(vreinterpretq_f32_m128(b))); + return vreinterpretq_m128_f32(vcombine_f32(a01, b23)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_0321(__m128 a, __m128 b) +{ + float32x2_t a21 = vget_high_f32( + vextq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 3)); + float32x2_t b03 = vget_low_f32( + vextq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b), 3)); + return vreinterpretq_m128_f32(vcombine_f32(a21, b03)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_2103(__m128 a, __m128 b) +{ + float32x2_t a03 = vget_low_f32( + vextq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 3)); + float32x2_t b21 = vget_high_f32( + vextq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b), 3)); + return vreinterpretq_m128_f32(vcombine_f32(a03, b21)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_1010(__m128 a, __m128 b) +{ + float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a)); + float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b)); + return vreinterpretq_m128_f32(vcombine_f32(a10, b10)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_1001(__m128 a, __m128 b) +{ + float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a))); + float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b)); + return vreinterpretq_m128_f32(vcombine_f32(a01, b10)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_0101(__m128 a, __m128 b) +{ + float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a))); + float32x2_t b01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(b))); + return vreinterpretq_m128_f32(vcombine_f32(a01, b01)); +} + +// keeps the low 64 bits of b in the low and puts the high 64 bits of a in the +// high +FORCE_INLINE __m128 _mm_shuffle_ps_3210(__m128 a, __m128 b) +{ + float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a)); + float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b)); + return vreinterpretq_m128_f32(vcombine_f32(a10, b32)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_0011(__m128 a, __m128 b) +{ + float32x2_t a11 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 1); + float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0); + return vreinterpretq_m128_f32(vcombine_f32(a11, b00)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_0022(__m128 a, __m128 b) +{ + float32x2_t a22 = + vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0); + float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0); + return vreinterpretq_m128_f32(vcombine_f32(a22, b00)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_2200(__m128 a, __m128 b) +{ + float32x2_t a00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 0); + float32x2_t b22 = + vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(b)), 0); + return vreinterpretq_m128_f32(vcombine_f32(a00, b22)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_3202(__m128 a, __m128 b) +{ + float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0); + float32x2_t a22 = + vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0); + float32x2_t a02 = vset_lane_f32(a0, a22, 1); /* TODO: use vzip ?*/ + float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b)); + return vreinterpretq_m128_f32(vcombine_f32(a02, b32)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_1133(__m128 a, __m128 b) +{ + float32x2_t a33 = + vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 1); + float32x2_t b11 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 1); + 
return vreinterpretq_m128_f32(vcombine_f32(a33, b11)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_2010(__m128 a, __m128 b) +{ + float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a)); + float32_t b2 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 2); + float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0); + float32x2_t b20 = vset_lane_f32(b2, b00, 1); + return vreinterpretq_m128_f32(vcombine_f32(a10, b20)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_2001(__m128 a, __m128 b) +{ + float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a))); + float32_t b2 = vgetq_lane_f32(b, 2); + float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0); + float32x2_t b20 = vset_lane_f32(b2, b00, 1); + return vreinterpretq_m128_f32(vcombine_f32(a01, b20)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_2032(__m128 a, __m128 b) +{ + float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a)); + float32_t b2 = vgetq_lane_f32(b, 2); + float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0); + float32x2_t b20 = vset_lane_f32(b2, b00, 1); + return vreinterpretq_m128_f32(vcombine_f32(a32, b20)); +} + +// For MSVC, we check only if it is ARM64, as every single ARM64 processor +// supported by WoA has crypto extensions. If this changes in the future, +// this can be verified via the runtime-only method of: +// IsProcessorFeaturePresent(PF_ARM_V8_CRYPTO_INSTRUCTIONS_AVAILABLE) +#if (defined(_M_ARM64) && !defined(__clang__)) || \ + (defined(__ARM_FEATURE_CRYPTO) && \ + (defined(__aarch64__) || __has_builtin(__builtin_arm_crypto_vmullp64))) +// Wraps vmull_p64 +FORCE_INLINE uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b) +{ + poly64_t a = vget_lane_p64(vreinterpret_p64_u64(_a), 0); + poly64_t b = vget_lane_p64(vreinterpret_p64_u64(_b), 0); +#if defined(_MSC_VER) && !defined(__clang__) + __n64 a1 = {a}, b1 = {b}; + return vreinterpretq_u64_p128(vmull_p64(a1, b1)); +#else + return vreinterpretq_u64_p128(vmull_p64(a, b)); +#endif +} +#else // ARMv7 polyfill +// ARMv7/some A64 lacks vmull_p64, but it has vmull_p8. +// +// vmull_p8 calculates 8 8-bit->16-bit polynomial multiplies, but we need a +// 64-bit->128-bit polynomial multiply. +// +// It needs some work and is somewhat slow, but it is still faster than all +// known scalar methods. 
+// +// Algorithm adapted to C from +// https://www.workofard.com/2017/07/ghash-for-low-end-cores/, which is adapted +// from "Fast Software Polynomial Multiplication on ARM Processors Using the +// NEON Engine" by Danilo Camara, Conrado Gouvea, Julio Lopez and Ricardo Dahab +// (https://hal.inria.fr/hal-01506572) +static uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b) +{ + poly8x8_t a = vreinterpret_p8_u64(_a); + poly8x8_t b = vreinterpret_p8_u64(_b); + + // Masks + uint8x16_t k48_32 = vcombine_u8(vcreate_u8(0x0000ffffffffffff), + vcreate_u8(0x00000000ffffffff)); + uint8x16_t k16_00 = vcombine_u8(vcreate_u8(0x000000000000ffff), + vcreate_u8(0x0000000000000000)); + + // Do the multiplies, rotating with vext to get all combinations + uint8x16_t d = vreinterpretq_u8_p16(vmull_p8(a, b)); // D = A0 * B0 + uint8x16_t e = + vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 1))); // E = A0 * B1 + uint8x16_t f = + vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 1), b)); // F = A1 * B0 + uint8x16_t g = + vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 2))); // G = A0 * B2 + uint8x16_t h = + vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 2), b)); // H = A2 * B0 + uint8x16_t i = + vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 3))); // I = A0 * B3 + uint8x16_t j = + vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 3), b)); // J = A3 * B0 + uint8x16_t k = + vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 4))); // L = A0 * B4 + + // Add cross products + uint8x16_t l = veorq_u8(e, f); // L = E + F + uint8x16_t m = veorq_u8(g, h); // M = G + H + uint8x16_t n = veorq_u8(i, j); // N = I + J + + // Interleave. Using vzip1 and vzip2 prevents Clang from emitting TBL + // instructions. +#if defined(__aarch64__) + uint8x16_t lm_p0 = vreinterpretq_u8_u64( + vzip1q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m))); + uint8x16_t lm_p1 = vreinterpretq_u8_u64( + vzip2q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m))); + uint8x16_t nk_p0 = vreinterpretq_u8_u64( + vzip1q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k))); + uint8x16_t nk_p1 = vreinterpretq_u8_u64( + vzip2q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k))); +#else + uint8x16_t lm_p0 = vcombine_u8(vget_low_u8(l), vget_low_u8(m)); + uint8x16_t lm_p1 = vcombine_u8(vget_high_u8(l), vget_high_u8(m)); + uint8x16_t nk_p0 = vcombine_u8(vget_low_u8(n), vget_low_u8(k)); + uint8x16_t nk_p1 = vcombine_u8(vget_high_u8(n), vget_high_u8(k)); +#endif + // t0 = (L) (P0 + P1) << 8 + // t1 = (M) (P2 + P3) << 16 + uint8x16_t t0t1_tmp = veorq_u8(lm_p0, lm_p1); + uint8x16_t t0t1_h = vandq_u8(lm_p1, k48_32); + uint8x16_t t0t1_l = veorq_u8(t0t1_tmp, t0t1_h); + + // t2 = (N) (P4 + P5) << 24 + // t3 = (K) (P6 + P7) << 32 + uint8x16_t t2t3_tmp = veorq_u8(nk_p0, nk_p1); + uint8x16_t t2t3_h = vandq_u8(nk_p1, k16_00); + uint8x16_t t2t3_l = veorq_u8(t2t3_tmp, t2t3_h); + + // De-interleave +#if defined(__aarch64__) + uint8x16_t t0 = vreinterpretq_u8_u64( + vuzp1q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h))); + uint8x16_t t1 = vreinterpretq_u8_u64( + vuzp2q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h))); + uint8x16_t t2 = vreinterpretq_u8_u64( + vuzp1q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h))); + uint8x16_t t3 = vreinterpretq_u8_u64( + vuzp2q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h))); +#else + uint8x16_t t1 = vcombine_u8(vget_high_u8(t0t1_l), vget_high_u8(t0t1_h)); + uint8x16_t t0 = vcombine_u8(vget_low_u8(t0t1_l), vget_low_u8(t0t1_h)); + uint8x16_t t3 = 
vcombine_u8(vget_high_u8(t2t3_l), vget_high_u8(t2t3_h)); + uint8x16_t t2 = vcombine_u8(vget_low_u8(t2t3_l), vget_low_u8(t2t3_h)); +#endif + // Shift the cross products + uint8x16_t t0_shift = vextq_u8(t0, t0, 15); // t0 << 8 + uint8x16_t t1_shift = vextq_u8(t1, t1, 14); // t1 << 16 + uint8x16_t t2_shift = vextq_u8(t2, t2, 13); // t2 << 24 + uint8x16_t t3_shift = vextq_u8(t3, t3, 12); // t3 << 32 + + // Accumulate the products + uint8x16_t cross1 = veorq_u8(t0_shift, t1_shift); + uint8x16_t cross2 = veorq_u8(t2_shift, t3_shift); + uint8x16_t mix = veorq_u8(d, cross1); + uint8x16_t r = veorq_u8(mix, cross2); + return vreinterpretq_u64_u8(r); +} +#endif // ARMv7 polyfill + +// C equivalent: +// __m128i _mm_shuffle_epi32_default(__m128i a, +// __constrange(0, 255) int imm) { +// __m128i ret; +// ret[0] = a[imm & 0x3]; ret[1] = a[(imm >> 2) & 0x3]; +// ret[2] = a[(imm >> 4) & 0x03]; ret[3] = a[(imm >> 6) & 0x03]; +// return ret; +// } +#define _mm_shuffle_epi32_default(a, imm) \ + vreinterpretq_m128i_s32(vsetq_lane_s32( \ + vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 6) & 0x3), \ + vsetq_lane_s32( \ + vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 4) & 0x3), \ + vsetq_lane_s32(vgetq_lane_s32(vreinterpretq_s32_m128i(a), \ + ((imm) >> 2) & 0x3), \ + vmovq_n_s32(vgetq_lane_s32( \ + vreinterpretq_s32_m128i(a), (imm) & (0x3))), \ + 1), \ + 2), \ + 3)) + +// Takes the upper 64 bits of a and places it in the low end of the result +// Takes the lower 64 bits of a and places it into the high end of the result. +FORCE_INLINE __m128i _mm_shuffle_epi_1032(__m128i a) +{ + int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a)); + int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a)); + return vreinterpretq_m128i_s32(vcombine_s32(a32, a10)); +} + +// takes the lower two 32-bit values from a and swaps them and places in low end +// of result takes the higher two 32 bit values from a and swaps them and places +// in high end of result. 
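For orientation: the carry-less 64x64-to-128-bit multiply helper above backs the PCLMULQDQ-style intrinsic _mm_clmulepi64_si128 provided later in the header; a typical call multiplies the two low halves (a and b are placeholders):

    __m128i product = _mm_clmulepi64_si128(a, b, 0x00); /* low 64 bits of a with low 64 bits of b */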
+FORCE_INLINE __m128i _mm_shuffle_epi_2301(__m128i a) +{ + int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a))); + int32x2_t a23 = vrev64_s32(vget_high_s32(vreinterpretq_s32_m128i(a))); + return vreinterpretq_m128i_s32(vcombine_s32(a01, a23)); +} + +// rotates the least significant 32 bits into the most significant 32 bits, and +// shifts the rest down +FORCE_INLINE __m128i _mm_shuffle_epi_0321(__m128i a) +{ + return vreinterpretq_m128i_s32( + vextq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(a), 1)); +} + +// rotates the most significant 32 bits into the least significant 32 bits, and +// shifts the rest up +FORCE_INLINE __m128i _mm_shuffle_epi_2103(__m128i a) +{ + return vreinterpretq_m128i_s32( + vextq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(a), 3)); +} + +// gets the lower 64 bits of a, and places it in the upper 64 bits +// gets the lower 64 bits of a and places it in the lower 64 bits +FORCE_INLINE __m128i _mm_shuffle_epi_1010(__m128i a) +{ + int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a)); + return vreinterpretq_m128i_s32(vcombine_s32(a10, a10)); +} + +// gets the lower 64 bits of a, swaps the 0 and 1 elements, and places it in the +// lower 64 bits gets the lower 64 bits of a, and places it in the upper 64 bits +FORCE_INLINE __m128i _mm_shuffle_epi_1001(__m128i a) +{ + int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a))); + int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a)); + return vreinterpretq_m128i_s32(vcombine_s32(a01, a10)); +} + +// gets the lower 64 bits of a, swaps the 0 and 1 elements and places it in the +// upper 64 bits gets the lower 64 bits of a, swaps the 0 and 1 elements, and +// places it in the lower 64 bits +FORCE_INLINE __m128i _mm_shuffle_epi_0101(__m128i a) +{ + int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a))); + return vreinterpretq_m128i_s32(vcombine_s32(a01, a01)); +} + +FORCE_INLINE __m128i _mm_shuffle_epi_2211(__m128i a) +{ + int32x2_t a11 = vdup_lane_s32(vget_low_s32(vreinterpretq_s32_m128i(a)), 1); + int32x2_t a22 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0); + return vreinterpretq_m128i_s32(vcombine_s32(a11, a22)); +} + +FORCE_INLINE __m128i _mm_shuffle_epi_0122(__m128i a) +{ + int32x2_t a22 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0); + int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a))); + return vreinterpretq_m128i_s32(vcombine_s32(a22, a01)); +} + +FORCE_INLINE __m128i _mm_shuffle_epi_3332(__m128i a) +{ + int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a)); + int32x2_t a33 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 1); + return vreinterpretq_m128i_s32(vcombine_s32(a32, a33)); +} + +#if defined(__aarch64__) || defined(_M_ARM64) +#define _mm_shuffle_epi32_splat(a, imm) \ + vreinterpretq_m128i_s32(vdupq_laneq_s32(vreinterpretq_s32_m128i(a), (imm))) +#else +#define _mm_shuffle_epi32_splat(a, imm) \ + vreinterpretq_m128i_s32( \ + vdupq_n_s32(vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm)))) +#endif + +// NEON does not support a general purpose permute intrinsic. +// Shuffle single-precision (32-bit) floating-point elements in a using the +// control in imm8, and store the results in dst. 
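The fixed-pattern helpers above are the specializations selected by the _mm_shuffle_epi32 dispatch further down; for example, broadcasting lane 0 (v is a placeholder __m128i):

    __m128i splat0 = _mm_shuffle_epi32(v, _MM_SHUFFLE(0, 0, 0, 0)); /* {v0, v0, v0, v0} */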
+// +// C equivalent: +// __m128 _mm_shuffle_ps_default(__m128 a, __m128 b, +// __constrange(0, 255) int imm) { +// __m128 ret; +// ret[0] = a[imm & 0x3]; ret[1] = a[(imm >> 2) & 0x3]; +// ret[2] = b[(imm >> 4) & 0x03]; ret[3] = b[(imm >> 6) & 0x03]; +// return ret; +// } +// +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_ps +#define _mm_shuffle_ps_default(a, b, imm) \ + vreinterpretq_m128_f32(vsetq_lane_f32( \ + vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 6) & 0x3), \ + vsetq_lane_f32( \ + vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 4) & 0x3), \ + vsetq_lane_f32( \ + vgetq_lane_f32(vreinterpretq_f32_m128(a), ((imm) >> 2) & 0x3), \ + vmovq_n_f32( \ + vgetq_lane_f32(vreinterpretq_f32_m128(a), (imm) & (0x3))), \ + 1), \ + 2), \ + 3)) + +// Shuffle 16-bit integers in the low 64 bits of a using the control in imm8. +// Store the results in the low 64 bits of dst, with the high 64 bits being +// copied from a to dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shufflelo_epi16 +#define _mm_shufflelo_epi16_function(a, imm) \ + _sse2neon_define1( \ + __m128i, a, int16x8_t ret = vreinterpretq_s16_m128i(_a); \ + int16x4_t lowBits = vget_low_s16(ret); \ + ret = vsetq_lane_s16(vget_lane_s16(lowBits, (imm) & (0x3)), ret, 0); \ + ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 2) & 0x3), ret, \ + 1); \ + ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 4) & 0x3), ret, \ + 2); \ + ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 6) & 0x3), ret, \ + 3); \ + _sse2neon_return(vreinterpretq_m128i_s16(ret));) + +// Shuffle 16-bit integers in the high 64 bits of a using the control in imm8. +// Store the results in the high 64 bits of dst, with the low 64 bits being +// copied from a to dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shufflehi_epi16 +#define _mm_shufflehi_epi16_function(a, imm) \ + _sse2neon_define1( \ + __m128i, a, int16x8_t ret = vreinterpretq_s16_m128i(_a); \ + int16x4_t highBits = vget_high_s16(ret); \ + ret = vsetq_lane_s16(vget_lane_s16(highBits, (imm) & (0x3)), ret, 4); \ + ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 2) & 0x3), ret, \ + 5); \ + ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 4) & 0x3), ret, \ + 6); \ + ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 6) & 0x3), ret, \ + 7); \ + _sse2neon_return(vreinterpretq_m128i_s16(ret));) + +/* MMX */ + +//_mm_empty is a no-op on arm +FORCE_INLINE void _mm_empty(void) {} + +/* SSE */ + +// Add packed single-precision (32-bit) floating-point elements in a and b, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_ps +FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_f32( + vaddq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +} + +// Add the lower single-precision (32-bit) floating-point element in a and b, +// store the result in the lower element of dst, and copy the upper 3 packed +// elements from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_ss +FORCE_INLINE __m128 _mm_add_ss(__m128 a, __m128 b) +{ + float32_t b0 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 0); + float32x4_t value = vsetq_lane_f32(b0, vdupq_n_f32(0), 0); + // the upper values in the result must be the remnants of . 
+ return vreinterpretq_m128_f32(vaddq_f32(a, value)); +} + +// Compute the bitwise AND of packed single-precision (32-bit) floating-point +// elements in a and b, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_ps +FORCE_INLINE __m128 _mm_and_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_s32( + vandq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b))); +} + +// Compute the bitwise NOT of packed single-precision (32-bit) floating-point +// elements in a and then AND with b, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_andnot_ps +FORCE_INLINE __m128 _mm_andnot_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_s32( + vbicq_s32(vreinterpretq_s32_m128(b), + vreinterpretq_s32_m128(a))); // *NOTE* argument swap +} + +// Average packed unsigned 16-bit integers in a and b, and store the results in +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_pu16 +FORCE_INLINE __m64 _mm_avg_pu16(__m64 a, __m64 b) +{ + return vreinterpret_m64_u16( + vrhadd_u16(vreinterpret_u16_m64(a), vreinterpret_u16_m64(b))); +} + +// Average packed unsigned 8-bit integers in a and b, and store the results in +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_pu8 +FORCE_INLINE __m64 _mm_avg_pu8(__m64 a, __m64 b) +{ + return vreinterpret_m64_u8( + vrhadd_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b))); +} + +// Compare packed single-precision (32-bit) floating-point elements in a and b +// for equality, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_ps +FORCE_INLINE __m128 _mm_cmpeq_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_u32( + vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +} + +// Compare the lower single-precision (32-bit) floating-point elements in a and +// b for equality, store the result in the lower element of dst, and copy the +// upper 3 packed elements from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_ss +FORCE_INLINE __m128 _mm_cmpeq_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_cmpeq_ps(a, b)); +} + +// Compare packed single-precision (32-bit) floating-point elements in a and b +// for greater-than-or-equal, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_ps +FORCE_INLINE __m128 _mm_cmpge_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_u32( + vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +} + +// Compare the lower single-precision (32-bit) floating-point elements in a and +// b for greater-than-or-equal, store the result in the lower element of dst, +// and copy the upper 3 packed elements from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_ss +FORCE_INLINE __m128 _mm_cmpge_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_cmpge_ps(a, b)); +} + +// Compare packed single-precision (32-bit) floating-point elements in a and b +// for greater-than, and store the results in dst. 
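Note the operand order implied by the vbicq_s32 swap above: _mm_andnot_ps(a, b) computes (~a) & b. A typical masking use (mask and values are placeholders):

    __m128 kept = _mm_andnot_ps(mask, values); /* zeroes the lanes whose mask bits are set */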
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_ps +FORCE_INLINE __m128 _mm_cmpgt_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_u32( + vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +} + +// Compare the lower single-precision (32-bit) floating-point elements in a and +// b for greater-than, store the result in the lower element of dst, and copy +// the upper 3 packed elements from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_ss +FORCE_INLINE __m128 _mm_cmpgt_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_cmpgt_ps(a, b)); +} + +// Compare packed single-precision (32-bit) floating-point elements in a and b +// for less-than-or-equal, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_ps +FORCE_INLINE __m128 _mm_cmple_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_u32( + vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +} + +// Compare the lower single-precision (32-bit) floating-point elements in a and +// b for less-than-or-equal, store the result in the lower element of dst, and +// copy the upper 3 packed elements from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_ss +FORCE_INLINE __m128 _mm_cmple_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_cmple_ps(a, b)); +} + +// Compare packed single-precision (32-bit) floating-point elements in a and b +// for less-than, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_ps +FORCE_INLINE __m128 _mm_cmplt_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_u32( + vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +} + +// Compare the lower single-precision (32-bit) floating-point elements in a and +// b for less-than, store the result in the lower element of dst, and copy the +// upper 3 packed elements from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_ss +FORCE_INLINE __m128 _mm_cmplt_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_cmplt_ps(a, b)); +} + +// Compare packed single-precision (32-bit) floating-point elements in a and b +// for not-equal, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_ps +FORCE_INLINE __m128 _mm_cmpneq_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_u32(vmvnq_u32( + vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)))); +} + +// Compare the lower single-precision (32-bit) floating-point elements in a and +// b for not-equal, store the result in the lower element of dst, and copy the +// upper 3 packed elements from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_ss +FORCE_INLINE __m128 _mm_cmpneq_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_cmpneq_ps(a, b)); +} + +// Compare packed single-precision (32-bit) floating-point elements in a and b +// for not-greater-than-or-equal, and store the results in dst. 
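The packed compares above return all-ones/all-zeros lane masks, which combine with the bitwise ops for branchless selection; a sketch of a per-lane minimum (NaN handling ignored; a and b are placeholders):

    __m128 lt  = _mm_cmplt_ps(a, b);              /* 0xFFFFFFFF where a < b */
    __m128 min = _mm_or_ps(_mm_and_ps(lt, a),     /* keep a where a < b     */
                           _mm_andnot_ps(lt, b)); /* keep b elsewhere       */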
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_ps +FORCE_INLINE __m128 _mm_cmpnge_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_u32(vmvnq_u32( + vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)))); +} + +// Compare the lower single-precision (32-bit) floating-point elements in a and +// b for not-greater-than-or-equal, store the result in the lower element of +// dst, and copy the upper 3 packed elements from a to the upper elements of +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_ss +FORCE_INLINE __m128 _mm_cmpnge_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_cmpnge_ps(a, b)); +} + +// Compare packed single-precision (32-bit) floating-point elements in a and b +// for not-greater-than, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_ps +FORCE_INLINE __m128 _mm_cmpngt_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_u32(vmvnq_u32( + vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)))); +} + +// Compare the lower single-precision (32-bit) floating-point elements in a and +// b for not-greater-than, store the result in the lower element of dst, and +// copy the upper 3 packed elements from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_ss +FORCE_INLINE __m128 _mm_cmpngt_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_cmpngt_ps(a, b)); +} + +// Compare packed single-precision (32-bit) floating-point elements in a and b +// for not-less-than-or-equal, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_ps +FORCE_INLINE __m128 _mm_cmpnle_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_u32(vmvnq_u32( + vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)))); +} + +// Compare the lower single-precision (32-bit) floating-point elements in a and +// b for not-less-than-or-equal, store the result in the lower element of dst, +// and copy the upper 3 packed elements from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_ss +FORCE_INLINE __m128 _mm_cmpnle_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_cmpnle_ps(a, b)); +} + +// Compare packed single-precision (32-bit) floating-point elements in a and b +// for not-less-than, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_ps +FORCE_INLINE __m128 _mm_cmpnlt_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_u32(vmvnq_u32( + vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)))); +} + +// Compare the lower single-precision (32-bit) floating-point elements in a and +// b for not-less-than, store the result in the lower element of dst, and copy +// the upper 3 packed elements from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_ss +FORCE_INLINE __m128 _mm_cmpnlt_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_cmpnlt_ps(a, b)); +} + +// Compare packed single-precision (32-bit) floating-point elements in a and b +// to see if neither is NaN, and store the results in dst. 
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_ps +// +// See also: +// http://stackoverflow.com/questions/8627331/what-does-ordered-unordered-comparison-mean +// http://stackoverflow.com/questions/29349621/neon-isnanval-intrinsics +FORCE_INLINE __m128 _mm_cmpord_ps(__m128 a, __m128 b) +{ + // Note: NEON does not have ordered compare builtin + // Need to compare a eq a and b eq b to check for NaN + // Do AND of results to get final + uint32x4_t ceqaa = + vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)); + uint32x4_t ceqbb = + vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b)); + return vreinterpretq_m128_u32(vandq_u32(ceqaa, ceqbb)); +} + +// Compare the lower single-precision (32-bit) floating-point elements in a and +// b to see if neither is NaN, store the result in the lower element of dst, and +// copy the upper 3 packed elements from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_ss +FORCE_INLINE __m128 _mm_cmpord_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_cmpord_ps(a, b)); +} + +// Compare packed single-precision (32-bit) floating-point elements in a and b +// to see if either is NaN, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_ps +FORCE_INLINE __m128 _mm_cmpunord_ps(__m128 a, __m128 b) +{ + uint32x4_t f32a = + vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)); + uint32x4_t f32b = + vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b)); + return vreinterpretq_m128_u32(vmvnq_u32(vandq_u32(f32a, f32b))); +} + +// Compare the lower single-precision (32-bit) floating-point elements in a and +// b to see if either is NaN, store the result in the lower element of dst, and +// copy the upper 3 packed elements from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_ss +FORCE_INLINE __m128 _mm_cmpunord_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_cmpunord_ps(a, b)); +} + +// Compare the lower single-precision (32-bit) floating-point element in a and b +// for equality, and return the boolean result (0 or 1). +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comieq_ss +FORCE_INLINE int _mm_comieq_ss(__m128 a, __m128 b) +{ + uint32x4_t a_eq_b = + vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)); + return vgetq_lane_u32(a_eq_b, 0) & 0x1; +} + +// Compare the lower single-precision (32-bit) floating-point element in a and b +// for greater-than-or-equal, and return the boolean result (0 or 1). +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comige_ss +FORCE_INLINE int _mm_comige_ss(__m128 a, __m128 b) +{ + uint32x4_t a_ge_b = + vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)); + return vgetq_lane_u32(a_ge_b, 0) & 0x1; +} + +// Compare the lower single-precision (32-bit) floating-point element in a and b +// for greater-than, and return the boolean result (0 or 1). 
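A common use of the unordered compare above: a lane is NaN exactly when it compares unordered with itself (x is a placeholder):

    __m128 nan_mask = _mm_cmpunord_ps(x, x); /* all-ones in NaN lanes, zero elsewhere */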
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comigt_ss +FORCE_INLINE int _mm_comigt_ss(__m128 a, __m128 b) +{ + uint32x4_t a_gt_b = + vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)); + return vgetq_lane_u32(a_gt_b, 0) & 0x1; +} + +// Compare the lower single-precision (32-bit) floating-point element in a and b +// for less-than-or-equal, and return the boolean result (0 or 1). +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comile_ss +FORCE_INLINE int _mm_comile_ss(__m128 a, __m128 b) +{ + uint32x4_t a_le_b = + vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)); + return vgetq_lane_u32(a_le_b, 0) & 0x1; +} + +// Compare the lower single-precision (32-bit) floating-point element in a and b +// for less-than, and return the boolean result (0 or 1). +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comilt_ss +FORCE_INLINE int _mm_comilt_ss(__m128 a, __m128 b) +{ + uint32x4_t a_lt_b = + vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)); + return vgetq_lane_u32(a_lt_b, 0) & 0x1; +} + +// Compare the lower single-precision (32-bit) floating-point element in a and b +// for not-equal, and return the boolean result (0 or 1). +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comineq_ss +FORCE_INLINE int _mm_comineq_ss(__m128 a, __m128 b) +{ + return !_mm_comieq_ss(a, b); +} + +// Convert packed signed 32-bit integers in b to packed single-precision +// (32-bit) floating-point elements, store the results in the lower 2 elements +// of dst, and copy the upper 2 packed elements from a to the upper elements of +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_pi2ps +FORCE_INLINE __m128 _mm_cvt_pi2ps(__m128 a, __m64 b) +{ + return vreinterpretq_m128_f32( + vcombine_f32(vcvt_f32_s32(vreinterpret_s32_m64(b)), + vget_high_f32(vreinterpretq_f32_m128(a)))); +} + +// Convert packed single-precision (32-bit) floating-point elements in a to +// packed 32-bit integers, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_ps2pi +FORCE_INLINE __m64 _mm_cvt_ps2pi(__m128 a) +{ +#if (defined(__aarch64__) || defined(_M_ARM64)) || \ + defined(__ARM_FEATURE_DIRECTED_ROUNDING) + return vreinterpret_m64_s32( + vget_low_s32(vcvtnq_s32_f32(vrndiq_f32(vreinterpretq_f32_m128(a))))); +#else + return vreinterpret_m64_s32(vcvt_s32_f32(vget_low_f32( + vreinterpretq_f32_m128(_mm_round_ps(a, _MM_FROUND_CUR_DIRECTION))))); +#endif +} + +// Convert the signed 32-bit integer b to a single-precision (32-bit) +// floating-point element, store the result in the lower element of dst, and +// copy the upper 3 packed elements from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_si2ss +FORCE_INLINE __m128 _mm_cvt_si2ss(__m128 a, int b) +{ + return vreinterpretq_m128_f32( + vsetq_lane_f32((float) b, vreinterpretq_f32_m128(a), 0)); +} + +// Convert the lower single-precision (32-bit) floating-point element in a to a +// 32-bit integer, and store the result in dst. 
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_ss2si +FORCE_INLINE int _mm_cvt_ss2si(__m128 a) +{ +#if (defined(__aarch64__) || defined(_M_ARM64)) || \ + defined(__ARM_FEATURE_DIRECTED_ROUNDING) + return vgetq_lane_s32(vcvtnq_s32_f32(vrndiq_f32(vreinterpretq_f32_m128(a))), + 0); +#else + float32_t data = vgetq_lane_f32( + vreinterpretq_f32_m128(_mm_round_ps(a, _MM_FROUND_CUR_DIRECTION)), 0); + return (int32_t) data; +#endif +} + +// Convert packed 16-bit integers in a to packed single-precision (32-bit) +// floating-point elements, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi16_ps +FORCE_INLINE __m128 _mm_cvtpi16_ps(__m64 a) +{ + return vreinterpretq_m128_f32( + vcvtq_f32_s32(vmovl_s16(vreinterpret_s16_m64(a)))); +} + +// Convert packed 32-bit integers in b to packed single-precision (32-bit) +// floating-point elements, store the results in the lower 2 elements of dst, +// and copy the upper 2 packed elements from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi32_ps +FORCE_INLINE __m128 _mm_cvtpi32_ps(__m128 a, __m64 b) +{ + return vreinterpretq_m128_f32( + vcombine_f32(vcvt_f32_s32(vreinterpret_s32_m64(b)), + vget_high_f32(vreinterpretq_f32_m128(a)))); +} + +// Convert packed signed 32-bit integers in a to packed single-precision +// (32-bit) floating-point elements, store the results in the lower 2 elements +// of dst, then convert the packed signed 32-bit integers in b to +// single-precision (32-bit) floating-point element, and store the results in +// the upper 2 elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi32x2_ps +FORCE_INLINE __m128 _mm_cvtpi32x2_ps(__m64 a, __m64 b) +{ + return vreinterpretq_m128_f32(vcvtq_f32_s32( + vcombine_s32(vreinterpret_s32_m64(a), vreinterpret_s32_m64(b)))); +} + +// Convert the lower packed 8-bit integers in a to packed single-precision +// (32-bit) floating-point elements, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi8_ps +FORCE_INLINE __m128 _mm_cvtpi8_ps(__m64 a) +{ + return vreinterpretq_m128_f32(vcvtq_f32_s32( + vmovl_s16(vget_low_s16(vmovl_s8(vreinterpret_s8_m64(a)))))); +} + +// Convert packed single-precision (32-bit) floating-point elements in a to +// packed 16-bit integers, and store the results in dst. Note: this intrinsic +// will generate 0x7FFF, rather than 0x8000, for input values between 0x7FFF and +// 0x7FFFFFFF. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pi16 +FORCE_INLINE __m64 _mm_cvtps_pi16(__m128 a) +{ + return vreinterpret_m64_s16( + vqmovn_s32(vreinterpretq_s32_m128i(_mm_cvtps_epi32(a)))); +} + +// Convert packed single-precision (32-bit) floating-point elements in a to +// packed 32-bit integers, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pi32 +#define _mm_cvtps_pi32(a) _mm_cvt_ps2pi(a) + +// Convert packed single-precision (32-bit) floating-point elements in a to +// packed 8-bit integers, and store the results in lower 4 elements of dst. +// Note: this intrinsic will generate 0x7F, rather than 0x80, for input values +// between 0x7F and 0x7FFFFFFF. 
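Illustrating the saturation note above: out-of-range inputs clamp to the int16 limits instead of wrapping:

    __m64 clamped = _mm_cvtps_pi16(_mm_set_ps1(1.0e9f)); /* every element becomes 0x7FFF */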
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pi8 +FORCE_INLINE __m64 _mm_cvtps_pi8(__m128 a) +{ + return vreinterpret_m64_s8(vqmovn_s16( + vcombine_s16(vreinterpret_s16_m64(_mm_cvtps_pi16(a)), vdup_n_s16(0)))); +} + +// Convert packed unsigned 16-bit integers in a to packed single-precision +// (32-bit) floating-point elements, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpu16_ps +FORCE_INLINE __m128 _mm_cvtpu16_ps(__m64 a) +{ + return vreinterpretq_m128_f32( + vcvtq_f32_u32(vmovl_u16(vreinterpret_u16_m64(a)))); +} + +// Convert the lower packed unsigned 8-bit integers in a to packed +// single-precision (32-bit) floating-point elements, and store the results in +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpu8_ps +FORCE_INLINE __m128 _mm_cvtpu8_ps(__m64 a) +{ + return vreinterpretq_m128_f32(vcvtq_f32_u32( + vmovl_u16(vget_low_u16(vmovl_u8(vreinterpret_u8_m64(a)))))); +} + +// Convert the signed 32-bit integer b to a single-precision (32-bit) +// floating-point element, store the result in the lower element of dst, and +// copy the upper 3 packed elements from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi32_ss +#define _mm_cvtsi32_ss(a, b) _mm_cvt_si2ss(a, b) + +// Convert the signed 64-bit integer b to a single-precision (32-bit) +// floating-point element, store the result in the lower element of dst, and +// copy the upper 3 packed elements from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64_ss +FORCE_INLINE __m128 _mm_cvtsi64_ss(__m128 a, int64_t b) +{ + return vreinterpretq_m128_f32( + vsetq_lane_f32((float) b, vreinterpretq_f32_m128(a), 0)); +} + +// Copy the lower single-precision (32-bit) floating-point element of a to dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_f32 +FORCE_INLINE float _mm_cvtss_f32(__m128 a) +{ + return vgetq_lane_f32(vreinterpretq_f32_m128(a), 0); +} + +// Convert the lower single-precision (32-bit) floating-point element in a to a +// 32-bit integer, and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_si32 +#define _mm_cvtss_si32(a) _mm_cvt_ss2si(a) + +// Convert the lower single-precision (32-bit) floating-point element in a to a +// 64-bit integer, and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_si64 +FORCE_INLINE int64_t _mm_cvtss_si64(__m128 a) +{ +#if (defined(__aarch64__) || defined(_M_ARM64)) || \ + defined(__ARM_FEATURE_DIRECTED_ROUNDING) + return (int64_t) vgetq_lane_f32(vrndiq_f32(vreinterpretq_f32_m128(a)), 0); +#else + float32_t data = vgetq_lane_f32( + vreinterpretq_f32_m128(_mm_round_ps(a, _MM_FROUND_CUR_DIRECTION)), 0); + return (int64_t) data; +#endif +} + +// Convert packed single-precision (32-bit) floating-point elements in a to +// packed 32-bit integers with truncation, and store the results in dst. 
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtt_ps2pi
+FORCE_INLINE __m64 _mm_cvtt_ps2pi(__m128 a)
+{
+    return vreinterpret_m64_s32(
+        vget_low_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a))));
+}
+
+// Convert the lower single-precision (32-bit) floating-point element in a to a
+// 32-bit integer with truncation, and store the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtt_ss2si
+FORCE_INLINE int _mm_cvtt_ss2si(__m128 a)
+{
+    return vgetq_lane_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a)), 0);
+}
+
+// Convert packed single-precision (32-bit) floating-point elements in a to
+// packed 32-bit integers with truncation, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttps_pi32
+#define _mm_cvttps_pi32(a) _mm_cvtt_ps2pi(a)
+
+// Convert the lower single-precision (32-bit) floating-point element in a to a
+// 32-bit integer with truncation, and store the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttss_si32
+#define _mm_cvttss_si32(a) _mm_cvtt_ss2si(a)
+
+// Convert the lower single-precision (32-bit) floating-point element in a to a
+// 64-bit integer with truncation, and store the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttss_si64
+FORCE_INLINE int64_t _mm_cvttss_si64(__m128 a)
+{
+    return (int64_t) vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
+}
+
+// Divide packed single-precision (32-bit) floating-point elements in a by
+// packed elements in b, and store the results in dst.
+// Due to ARMv7-A NEON's lack of a precise division intrinsic, we implement
+// division by multiplying a by an estimate of b's reciprocal that is refined
+// with the Newton-Raphson method.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_ps
+FORCE_INLINE __m128 _mm_div_ps(__m128 a, __m128 b)
+{
+#if defined(__aarch64__) || defined(_M_ARM64)
+    return vreinterpretq_m128_f32(
+        vdivq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+#else
+    float32x4_t recip = vrecpeq_f32(vreinterpretq_f32_m128(b));
+    recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(b)));
+    // Additional Newton-Raphson iteration for accuracy
+    recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(b)));
+    return vreinterpretq_m128_f32(vmulq_f32(vreinterpretq_f32_m128(a), recip));
+#endif
+}
+
+// Divide the lower single-precision (32-bit) floating-point element in a by the
+// lower single-precision (32-bit) floating-point element in b, store the result
+// in the lower element of dst, and copy the upper 3 packed elements from a to
+// the upper elements of dst.
+// Warning: on ARMv7-A the result is not IEEE 754 compliant and does not match
+// the result produced on Intel hardware exactly.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_ss
+FORCE_INLINE __m128 _mm_div_ss(__m128 a, __m128 b)
+{
+    float32_t value =
+        vgetq_lane_f32(vreinterpretq_f32_m128(_mm_div_ps(a, b)), 0);
+    return vreinterpretq_m128_f32(
+        vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
+}
+
+// Extract a 16-bit integer from a, selected with imm8, and store the result in
+// the lower element of dst.
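The two vrecpsq_f32/vmulq_f32 steps in the ARMv7-A division path each apply the Newton-Raphson update r = r * (2 - b * r), which roughly doubles the number of correct bits of the reciprocal estimate. A scalar model of that refinement (not part of the header, and refine_recip is a made-up helper name) might read:

#include <stdio.h>

/* Scalar sketch of the reciprocal refinement used on ARMv7-A:
 * each step computes r = r * (2 - b * r). */
static float refine_recip(float b, float r)
{
    return r * (2.0f - b * r);
}

int main(void)
{
    float b = 3.0f;
    float r = 0.3f;             /* coarse estimate of 1/3, like vrecpeq_f32 */
    r = refine_recip(b, r);     /* first vrecpsq/vmulq step */
    r = refine_recip(b, r);     /* second step, as in the #else path */
    printf("%.7f\n", 2.0f * r); /* roughly 0.6666666, i.e. 2/3 */
    return 0;
}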
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_pi16 +#define _mm_extract_pi16(a, imm) \ + (int32_t) vget_lane_u16(vreinterpret_u16_m64(a), (imm)) + +// Free aligned memory that was allocated with _mm_malloc. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_free +#if !defined(SSE2NEON_ALLOC_DEFINED) +FORCE_INLINE void _mm_free(void *addr) +{ + free(addr); +} +#endif + +FORCE_INLINE uint64_t _sse2neon_get_fpcr(void) +{ + uint64_t value; +#if defined(_MSC_VER) && !defined(__clang__) + value = _ReadStatusReg(ARM64_FPCR); +#else + __asm__ __volatile__("mrs %0, FPCR" : "=r"(value)); /* read */ +#endif + return value; +} + +FORCE_INLINE void _sse2neon_set_fpcr(uint64_t value) +{ +#if defined(_MSC_VER) && !defined(__clang__) + _WriteStatusReg(ARM64_FPCR, value); +#else + __asm__ __volatile__("msr FPCR, %0" ::"r"(value)); /* write */ +#endif +} + +// Macro: Get the flush zero bits from the MXCSR control and status register. +// The flush zero may contain any of the following flags: _MM_FLUSH_ZERO_ON or +// _MM_FLUSH_ZERO_OFF +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_FLUSH_ZERO_MODE +FORCE_INLINE unsigned int _sse2neon_mm_get_flush_zero_mode(void) +{ + union { + fpcr_bitfield field; +#if defined(__aarch64__) || defined(_M_ARM64) + uint64_t value; +#else + uint32_t value; +#endif + } r; + +#if defined(__aarch64__) || defined(_M_ARM64) + r.value = _sse2neon_get_fpcr(); +#else + __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */ +#endif + + return r.field.bit24 ? _MM_FLUSH_ZERO_ON : _MM_FLUSH_ZERO_OFF; +} + +// Macro: Get the rounding mode bits from the MXCSR control and status register. +// The rounding mode may contain any of the following flags: _MM_ROUND_NEAREST, +// _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_ROUNDING_MODE +FORCE_INLINE unsigned int _MM_GET_ROUNDING_MODE(void) +{ + switch (fegetround()) { + case FE_TONEAREST: + return _MM_ROUND_NEAREST; + case FE_DOWNWARD: + return _MM_ROUND_DOWN; + case FE_UPWARD: + return _MM_ROUND_UP; + case FE_TOWARDZERO: + return _MM_ROUND_TOWARD_ZERO; + default: + // fegetround() must return _MM_ROUND_NEAREST, _MM_ROUND_DOWN, + // _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO on success. all the other error + // cases we treat them as FE_TOWARDZERO (truncate). + return _MM_ROUND_TOWARD_ZERO; + } +} + +// Copy a to dst, and insert the 16-bit integer i into dst at the location +// specified by imm8. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_pi16 +#define _mm_insert_pi16(a, b, imm) \ + vreinterpret_m64_s16(vset_lane_s16((b), vreinterpret_s16_m64(a), (imm))) + +// Load 128-bits (composed of 4 packed single-precision (32-bit) floating-point +// elements) from memory into dst. mem_addr must be aligned on a 16-byte +// boundary or a general-protection exception may be generated. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_ps +FORCE_INLINE __m128 _mm_load_ps(const float *p) +{ + return vreinterpretq_m128_f32(vld1q_f32(p)); +} + +// Load a single-precision (32-bit) floating-point element from memory into all +// elements of dst. 
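Because _MM_GET_ROUNDING_MODE is a thin wrapper over fegetround(), the rounding mode observed by the conversion intrinsics can be driven through <fenv.h>. A rough usage sketch, assuming an sse2neon.h include and a build where fesetround() takes effect for the code below:

#include <fenv.h>
#include <stdio.h>
#include "sse2neon.h"

int main(void)
{
    unsigned int saved = _MM_GET_ROUNDING_MODE(); /* usually _MM_ROUND_NEAREST */
    fesetround(FE_TOWARDZERO);
    printf("%d\n", _mm_cvt_ss2si(_mm_set_ss(2.9f))); /* 2: truncated */
    fesetround(FE_TONEAREST);
    printf("%d\n", saved == _MM_ROUND_NEAREST);
    return 0;
}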
+// +// dst[31:0] := MEM[mem_addr+31:mem_addr] +// dst[63:32] := MEM[mem_addr+31:mem_addr] +// dst[95:64] := MEM[mem_addr+31:mem_addr] +// dst[127:96] := MEM[mem_addr+31:mem_addr] +// +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_ps1 +#define _mm_load_ps1 _mm_load1_ps + +// Load a single-precision (32-bit) floating-point element from memory into the +// lower of dst, and zero the upper 3 elements. mem_addr does not need to be +// aligned on any particular boundary. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_ss +FORCE_INLINE __m128 _mm_load_ss(const float *p) +{ + return vreinterpretq_m128_f32(vsetq_lane_f32(*p, vdupq_n_f32(0), 0)); +} + +// Load a single-precision (32-bit) floating-point element from memory into all +// elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load1_ps +FORCE_INLINE __m128 _mm_load1_ps(const float *p) +{ + return vreinterpretq_m128_f32(vld1q_dup_f32(p)); +} + +// Load 2 single-precision (32-bit) floating-point elements from memory into the +// upper 2 elements of dst, and copy the lower 2 elements from a to dst. +// mem_addr does not need to be aligned on any particular boundary. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadh_pi +FORCE_INLINE __m128 _mm_loadh_pi(__m128 a, __m64 const *p) +{ + return vreinterpretq_m128_f32( + vcombine_f32(vget_low_f32(a), vld1_f32((const float32_t *) p))); +} + +// Load 2 single-precision (32-bit) floating-point elements from memory into the +// lower 2 elements of dst, and copy the upper 2 elements from a to dst. +// mem_addr does not need to be aligned on any particular boundary. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadl_pi +FORCE_INLINE __m128 _mm_loadl_pi(__m128 a, __m64 const *p) +{ + return vreinterpretq_m128_f32( + vcombine_f32(vld1_f32((const float32_t *) p), vget_high_f32(a))); +} + +// Load 4 single-precision (32-bit) floating-point elements from memory into dst +// in reverse order. mem_addr must be aligned on a 16-byte boundary or a +// general-protection exception may be generated. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadr_ps +FORCE_INLINE __m128 _mm_loadr_ps(const float *p) +{ + float32x4_t v = vrev64q_f32(vld1q_f32(p)); + return vreinterpretq_m128_f32(vextq_f32(v, v, 2)); +} + +// Load 128-bits (composed of 4 packed single-precision (32-bit) floating-point +// elements) from memory into dst. mem_addr does not need to be aligned on any +// particular boundary. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_ps +FORCE_INLINE __m128 _mm_loadu_ps(const float *p) +{ + // for neon, alignment doesn't matter, so _mm_load_ps and _mm_loadu_ps are + // equivalent for neon + return vreinterpretq_m128_f32(vld1q_f32(p)); +} + +// Load unaligned 16-bit integer from memory into the first element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si16 +FORCE_INLINE __m128i _mm_loadu_si16(const void *p) +{ + return vreinterpretq_m128i_s16( + vsetq_lane_s16(*(const unaligned_int16_t *) p, vdupq_n_s16(0), 0)); +} + +// Load unaligned 64-bit integer from memory into the first element of dst. 
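A small example may help show how _mm_loadl_pi and _mm_loadh_pi assemble a vector from two 64-bit halves (hypothetical program, assuming sse2neon.h):

#include <stdio.h>
#include "sse2neon.h"

int main(void)
{
    float lo[2] = {1.0f, 2.0f}, hi[2] = {3.0f, 4.0f}, out[4];
    __m128 v = _mm_setzero_ps();
    v = _mm_loadl_pi(v, (const __m64 *) lo); /* lower two lanes from lo */
    v = _mm_loadh_pi(v, (const __m64 *) hi); /* upper two lanes from hi */
    _mm_storeu_ps(out, v);
    printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]); /* 1 2 3 4 */
    return 0;
}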
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si64 +FORCE_INLINE __m128i _mm_loadu_si64(const void *p) +{ + return vreinterpretq_m128i_s64( + vsetq_lane_s64(*(const unaligned_int64_t *) p, vdupq_n_s64(0), 0)); +} + +// Allocate size bytes of memory, aligned to the alignment specified in align, +// and return a pointer to the allocated memory. _mm_free should be used to free +// memory that is allocated with _mm_malloc. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_malloc +#if !defined(SSE2NEON_ALLOC_DEFINED) +FORCE_INLINE void *_mm_malloc(size_t size, size_t align) +{ + void *ptr; + if (align == 1) + return malloc(size); + if (align == 2 || (sizeof(void *) == 8 && align == 4)) + align = sizeof(void *); + if (!posix_memalign(&ptr, align, size)) + return ptr; + return NULL; +} +#endif + +// Conditionally store 8-bit integer elements from a into memory using mask +// (elements are not stored when the highest bit is not set in the corresponding +// element) and a non-temporal memory hint. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskmove_si64 +FORCE_INLINE void _mm_maskmove_si64(__m64 a, __m64 mask, char *mem_addr) +{ + int8x8_t shr_mask = vshr_n_s8(vreinterpret_s8_m64(mask), 7); + __m128 b = _mm_load_ps((const float *) mem_addr); + int8x8_t masked = + vbsl_s8(vreinterpret_u8_s8(shr_mask), vreinterpret_s8_m64(a), + vreinterpret_s8_u64(vget_low_u64(vreinterpretq_u64_m128(b)))); + vst1_s8((int8_t *) mem_addr, masked); +} + +// Conditionally store 8-bit integer elements from a into memory using mask +// (elements are not stored when the highest bit is not set in the corresponding +// element) and a non-temporal memory hint. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_maskmovq +#define _m_maskmovq(a, mask, mem_addr) _mm_maskmove_si64(a, mask, mem_addr) + +// Compare packed signed 16-bit integers in a and b, and store packed maximum +// values in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_pi16 +FORCE_INLINE __m64 _mm_max_pi16(__m64 a, __m64 b) +{ + return vreinterpret_m64_s16( + vmax_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b))); +} + +// Compare packed single-precision (32-bit) floating-point elements in a and b, +// and store packed maximum values in dst. dst does not follow the IEEE Standard +// for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are NaN or +// signed-zero values. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_ps +FORCE_INLINE __m128 _mm_max_ps(__m128 a, __m128 b) +{ +#if SSE2NEON_PRECISE_MINMAX + float32x4_t _a = vreinterpretq_f32_m128(a); + float32x4_t _b = vreinterpretq_f32_m128(b); + return vreinterpretq_m128_f32(vbslq_f32(vcgtq_f32(_a, _b), _a, _b)); +#else + return vreinterpretq_m128_f32( + vmaxq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +#endif +} + +// Compare packed unsigned 8-bit integers in a and b, and store packed maximum +// values in dst. 
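When SSE2NEON_ALLOC_DEFINED is not set, _mm_malloc and _mm_free provide aligned allocation on top of posix_memalign. A minimal usage sketch (assuming sse2neon.h and a POSIX libc):

#include <stdint.h>
#include <stdio.h>
#include "sse2neon.h"

int main(void)
{
    /* 64 floats aligned to 16 bytes, suitable for _mm_load_ps/_mm_store_ps. */
    float *buf = (float *) _mm_malloc(64 * sizeof(float), 16);
    if (!buf)
        return 1;
    printf("aligned: %d\n", ((uintptr_t) buf % 16) == 0); /* 1 */
    _mm_free(buf);
    return 0;
}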
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_pu8 +FORCE_INLINE __m64 _mm_max_pu8(__m64 a, __m64 b) +{ + return vreinterpret_m64_u8( + vmax_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b))); +} + +// Compare the lower single-precision (32-bit) floating-point elements in a and +// b, store the maximum value in the lower element of dst, and copy the upper 3 +// packed elements from a to the upper element of dst. dst does not follow the +// IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when +// inputs are NaN or signed-zero values. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_ss +FORCE_INLINE __m128 _mm_max_ss(__m128 a, __m128 b) +{ + float32_t value = vgetq_lane_f32(_mm_max_ps(a, b), 0); + return vreinterpretq_m128_f32( + vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0)); +} + +// Compare packed signed 16-bit integers in a and b, and store packed minimum +// values in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_pi16 +FORCE_INLINE __m64 _mm_min_pi16(__m64 a, __m64 b) +{ + return vreinterpret_m64_s16( + vmin_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b))); +} + +// Compare packed single-precision (32-bit) floating-point elements in a and b, +// and store packed minimum values in dst. dst does not follow the IEEE Standard +// for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are NaN or +// signed-zero values. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_ps +FORCE_INLINE __m128 _mm_min_ps(__m128 a, __m128 b) +{ +#if SSE2NEON_PRECISE_MINMAX + float32x4_t _a = vreinterpretq_f32_m128(a); + float32x4_t _b = vreinterpretq_f32_m128(b); + return vreinterpretq_m128_f32(vbslq_f32(vcltq_f32(_a, _b), _a, _b)); +#else + return vreinterpretq_m128_f32( + vminq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +#endif +} + +// Compare packed unsigned 8-bit integers in a and b, and store packed minimum +// values in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_pu8 +FORCE_INLINE __m64 _mm_min_pu8(__m64 a, __m64 b) +{ + return vreinterpret_m64_u8( + vmin_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b))); +} + +// Compare the lower single-precision (32-bit) floating-point elements in a and +// b, store the minimum value in the lower element of dst, and copy the upper 3 +// packed elements from a to the upper element of dst. dst does not follow the +// IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when +// inputs are NaN or signed-zero values. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_ss +FORCE_INLINE __m128 _mm_min_ss(__m128 a, __m128 b) +{ + float32_t value = vgetq_lane_f32(_mm_min_ps(a, b), 0); + return vreinterpretq_m128_f32( + vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0)); +} + +// Move the lower single-precision (32-bit) floating-point element from b to the +// lower element of dst, and copy the upper 3 packed elements from a to the +// upper elements of dst. 
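A short example of the lane-wise min/max selection performed by _mm_min_ps and _mm_max_ps (hypothetical program, assuming sse2neon.h):

#include <stdio.h>
#include "sse2neon.h"

int main(void)
{
    float out[4];
    __m128 a = _mm_set_ps(4.0f, -1.0f, 2.0f, 8.0f);
    __m128 b = _mm_set_ps(3.0f, 5.0f, 2.5f, -8.0f);
    _mm_storeu_ps(out, _mm_min_ps(a, b));
    printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]); /* -8 2 -1 3 */
    _mm_storeu_ps(out, _mm_max_ps(a, b));
    printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]); /* 8 2.5 5 4 */
    return 0;
}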
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_move_ss
+FORCE_INLINE __m128 _mm_move_ss(__m128 a, __m128 b)
+{
+    return vreinterpretq_m128_f32(
+        vsetq_lane_f32(vgetq_lane_f32(vreinterpretq_f32_m128(b), 0),
+                       vreinterpretq_f32_m128(a), 0));
+}
+
+// Move the upper 2 single-precision (32-bit) floating-point elements from b to
+// the lower 2 elements of dst, and copy the upper 2 elements from a to the
+// upper 2 elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movehl_ps
+FORCE_INLINE __m128 _mm_movehl_ps(__m128 a, __m128 b)
+{
+#if defined(__aarch64__) || defined(_M_ARM64)
+    return vreinterpretq_m128_u64(
+        vzip2q_u64(vreinterpretq_u64_m128(b), vreinterpretq_u64_m128(a)));
+#else
+    float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
+    float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
+    return vreinterpretq_m128_f32(vcombine_f32(b32, a32));
+#endif
+}
+
+// Move the lower 2 single-precision (32-bit) floating-point elements from b to
+// the upper 2 elements of dst, and copy the lower 2 elements from a to the
+// lower 2 elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movelh_ps
+FORCE_INLINE __m128 _mm_movelh_ps(__m128 __A, __m128 __B)
+{
+    float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(__A));
+    float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(__B));
+    return vreinterpretq_m128_f32(vcombine_f32(a10, b10));
+}
+
+// Create mask from the most significant bit of each 8-bit element in a, and
+// store the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_pi8
+FORCE_INLINE int _mm_movemask_pi8(__m64 a)
+{
+    uint8x8_t input = vreinterpret_u8_m64(a);
+#if defined(__aarch64__) || defined(_M_ARM64)
+    static const int8_t shift[8] = {0, 1, 2, 3, 4, 5, 6, 7};
+    uint8x8_t tmp = vshr_n_u8(input, 7);
+    return vaddv_u8(vshl_u8(tmp, vld1_s8(shift)));
+#else
+    // Refer to the implementation of `_mm_movemask_epi8`
+    uint16x4_t high_bits = vreinterpret_u16_u8(vshr_n_u8(input, 7));
+    uint32x2_t paired16 =
+        vreinterpret_u32_u16(vsra_n_u16(high_bits, high_bits, 7));
+    uint8x8_t paired32 =
+        vreinterpret_u8_u32(vsra_n_u32(paired16, paired16, 14));
+    return vget_lane_u8(paired32, 0) | ((int) vget_lane_u8(paired32, 4) << 4);
+#endif
+}
+
+// Set each bit of mask dst based on the most significant bit of the
+// corresponding packed single-precision (32-bit) floating-point element in a.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_ps
+FORCE_INLINE int _mm_movemask_ps(__m128 a)
+{
+    uint32x4_t input = vreinterpretq_u32_m128(a);
+#if defined(__aarch64__) || defined(_M_ARM64)
+    static const int32_t shift[4] = {0, 1, 2, 3};
+    uint32x4_t tmp = vshrq_n_u32(input, 31);
+    return vaddvq_u32(vshlq_u32(tmp, vld1q_s32(shift)));
+#else
+    // Uses the exact same method as _mm_movemask_epi8, see that for details.
+    // Shift out everything but the sign bits with a 32-bit unsigned shift
+    // right.
+    uint64x2_t high_bits = vreinterpretq_u64_u32(vshrq_n_u32(input, 31));
+    // Merge the two pairs together with a 64-bit unsigned shift right + add.
+    uint8x16_t paired =
+        vreinterpretq_u8_u64(vsraq_n_u64(high_bits, high_bits, 31));
+    // Extract the result.
+    return vgetq_lane_u8(paired, 0) | (vgetq_lane_u8(paired, 8) << 2);
+#endif
+}
+
+// Multiply packed single-precision (32-bit) floating-point elements in a and b,
+// and store the results in dst.
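Since _mm_movemask_ps packs the four sign bits into the low bits of an int, it is typically used to act on comparison results as a whole. A small sketch, again assuming sse2neon.h:

#include <stdio.h>
#include "sse2neon.h"

int main(void)
{
    __m128 v = _mm_set_ps(-4.0f, 3.0f, -2.0f, 1.0f); /* lanes: 1, -2, 3, -4 */
    /* Each set bit in the mask corresponds to a lane with its sign bit set. */
    int mask = _mm_movemask_ps(v);
    printf("0x%x\n", mask); /* 0xa: lanes 1 and 3 are negative */
    return 0;
}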
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_ps +FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_f32( + vmulq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +} + +// Multiply the lower single-precision (32-bit) floating-point element in a and +// b, store the result in the lower element of dst, and copy the upper 3 packed +// elements from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_ss +FORCE_INLINE __m128 _mm_mul_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_mul_ps(a, b)); +} + +// Multiply the packed unsigned 16-bit integers in a and b, producing +// intermediate 32-bit integers, and store the high 16 bits of the intermediate +// integers in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhi_pu16 +FORCE_INLINE __m64 _mm_mulhi_pu16(__m64 a, __m64 b) +{ + return vreinterpret_m64_u16(vshrn_n_u32( + vmull_u16(vreinterpret_u16_m64(a), vreinterpret_u16_m64(b)), 16)); +} + +// Compute the bitwise OR of packed single-precision (32-bit) floating-point +// elements in a and b, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_or_ps +FORCE_INLINE __m128 _mm_or_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_s32( + vorrq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b))); +} + +// Average packed unsigned 8-bit integers in a and b, and store the results in +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pavgb +#define _m_pavgb(a, b) _mm_avg_pu8(a, b) + +// Average packed unsigned 16-bit integers in a and b, and store the results in +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pavgw +#define _m_pavgw(a, b) _mm_avg_pu16(a, b) + +// Extract a 16-bit integer from a, selected with imm8, and store the result in +// the lower element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pextrw +#define _m_pextrw(a, imm) _mm_extract_pi16(a, imm) + +// Copy a to dst, and insert the 16-bit integer i into dst at the location +// specified by imm8. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=m_pinsrw +#define _m_pinsrw(a, i, imm) _mm_insert_pi16(a, i, imm) + +// Compare packed signed 16-bit integers in a and b, and store packed maximum +// values in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pmaxsw +#define _m_pmaxsw(a, b) _mm_max_pi16(a, b) + +// Compare packed unsigned 8-bit integers in a and b, and store packed maximum +// values in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pmaxub +#define _m_pmaxub(a, b) _mm_max_pu8(a, b) + +// Compare packed signed 16-bit integers in a and b, and store packed minimum +// values in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pminsw +#define _m_pminsw(a, b) _mm_min_pi16(a, b) + +// Compare packed unsigned 8-bit integers in a and b, and store packed minimum +// values in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pminub +#define _m_pminub(a, b) _mm_min_pu8(a, b) + +// Create mask from the most significant bit of each 8-bit element in a, and +// store the result in dst. 
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pmovmskb +#define _m_pmovmskb(a) _mm_movemask_pi8(a) + +// Multiply the packed unsigned 16-bit integers in a and b, producing +// intermediate 32-bit integers, and store the high 16 bits of the intermediate +// integers in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pmulhuw +#define _m_pmulhuw(a, b) _mm_mulhi_pu16(a, b) + +// Fetch the line of data from memory that contains address p to a location in +// the cache hierarchy specified by the locality hint i. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_prefetch +FORCE_INLINE void _mm_prefetch(char const *p, int i) +{ + (void) i; +#if defined(_MSC_VER) && !defined(__clang__) + switch (i) { + case _MM_HINT_NTA: + __prefetch2(p, 1); + break; + case _MM_HINT_T0: + __prefetch2(p, 0); + break; + case _MM_HINT_T1: + __prefetch2(p, 2); + break; + case _MM_HINT_T2: + __prefetch2(p, 4); + break; + } +#else + switch (i) { + case _MM_HINT_NTA: + __builtin_prefetch(p, 0, 0); + break; + case _MM_HINT_T0: + __builtin_prefetch(p, 0, 3); + break; + case _MM_HINT_T1: + __builtin_prefetch(p, 0, 2); + break; + case _MM_HINT_T2: + __builtin_prefetch(p, 0, 1); + break; + } +#endif +} + +// Compute the absolute differences of packed unsigned 8-bit integers in a and +// b, then horizontally sum each consecutive 8 differences to produce four +// unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low +// 16 bits of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=m_psadbw +#define _m_psadbw(a, b) _mm_sad_pu8(a, b) + +// Shuffle 16-bit integers in a using the control in imm8, and store the results +// in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pshufw +#define _m_pshufw(a, imm) _mm_shuffle_pi16(a, imm) + +// Compute the approximate reciprocal of packed single-precision (32-bit) +// floating-point elements in a, and store the results in dst. The maximum +// relative error for this approximation is less than 1.5*2^-12. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rcp_ps +FORCE_INLINE __m128 _mm_rcp_ps(__m128 in) +{ + float32x4_t recip = vrecpeq_f32(vreinterpretq_f32_m128(in)); + recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(in))); +#if SSE2NEON_PRECISE_DIV + // Additional Netwon-Raphson iteration for accuracy + recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(in))); +#endif + return vreinterpretq_m128_f32(recip); +} + +// Compute the approximate reciprocal of the lower single-precision (32-bit) +// floating-point element in a, store the result in the lower element of dst, +// and copy the upper 3 packed elements from a to the upper elements of dst. The +// maximum relative error for this approximation is less than 1.5*2^-12. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rcp_ss +FORCE_INLINE __m128 _mm_rcp_ss(__m128 a) +{ + return _mm_move_ss(a, _mm_rcp_ps(a)); +} + +// Compute the approximate reciprocal square root of packed single-precision +// (32-bit) floating-point elements in a, and store the results in dst. The +// maximum relative error for this approximation is less than 1.5*2^-12. 
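The documented error bound of _mm_rcp_ps (less than 1.5*2^-12) can be tightened further in user code with one extra Newton-Raphson step, mirroring what the implementation does internally. An illustrative sketch assuming sse2neon.h:

#include <stdio.h>
#include "sse2neon.h"

int main(void)
{
    __m128 x = _mm_set1_ps(3.0f);
    /* Approximate 1/x, then apply one more refinement step r = r * (2 - x*r). */
    __m128 r = _mm_rcp_ps(x);
    r = _mm_mul_ps(r, _mm_sub_ps(_mm_set1_ps(2.0f), _mm_mul_ps(x, r)));
    printf("%.7f\n", _mm_cvtss_f32(r)); /* close to 0.3333333 */
    return 0;
}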
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rsqrt_ps +FORCE_INLINE __m128 _mm_rsqrt_ps(__m128 in) +{ + float32x4_t out = vrsqrteq_f32(vreinterpretq_f32_m128(in)); + + // Generate masks for detecting whether input has any 0.0f/-0.0f + // (which becomes positive/negative infinity by IEEE-754 arithmetic rules). + const uint32x4_t pos_inf = vdupq_n_u32(0x7F800000); + const uint32x4_t neg_inf = vdupq_n_u32(0xFF800000); + const uint32x4_t has_pos_zero = + vceqq_u32(pos_inf, vreinterpretq_u32_f32(out)); + const uint32x4_t has_neg_zero = + vceqq_u32(neg_inf, vreinterpretq_u32_f32(out)); + + out = vmulq_f32( + out, vrsqrtsq_f32(vmulq_f32(vreinterpretq_f32_m128(in), out), out)); +#if SSE2NEON_PRECISE_SQRT + // Additional Netwon-Raphson iteration for accuracy + out = vmulq_f32( + out, vrsqrtsq_f32(vmulq_f32(vreinterpretq_f32_m128(in), out), out)); +#endif + + // Set output vector element to infinity/negative-infinity if + // the corresponding input vector element is 0.0f/-0.0f. + out = vbslq_f32(has_pos_zero, (float32x4_t) pos_inf, out); + out = vbslq_f32(has_neg_zero, (float32x4_t) neg_inf, out); + + return vreinterpretq_m128_f32(out); +} + +// Compute the approximate reciprocal square root of the lower single-precision +// (32-bit) floating-point element in a, store the result in the lower element +// of dst, and copy the upper 3 packed elements from a to the upper elements of +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rsqrt_ss +FORCE_INLINE __m128 _mm_rsqrt_ss(__m128 in) +{ + return vsetq_lane_f32(vgetq_lane_f32(_mm_rsqrt_ps(in), 0), in, 0); +} + +// Compute the absolute differences of packed unsigned 8-bit integers in a and +// b, then horizontally sum each consecutive 8 differences to produce four +// unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low +// 16 bits of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sad_pu8 +FORCE_INLINE __m64 _mm_sad_pu8(__m64 a, __m64 b) +{ + uint64x1_t t = vpaddl_u32(vpaddl_u16( + vpaddl_u8(vabd_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b))))); + return vreinterpret_m64_u16( + vset_lane_u16((int) vget_lane_u64(t, 0), vdup_n_u16(0), 0)); +} + +// Macro: Set the flush zero bits of the MXCSR control and status register to +// the value in unsigned 32-bit integer a. The flush zero may contain any of the +// following flags: _MM_FLUSH_ZERO_ON or _MM_FLUSH_ZERO_OFF +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_FLUSH_ZERO_MODE +FORCE_INLINE void _sse2neon_mm_set_flush_zero_mode(unsigned int flag) +{ + // AArch32 Advanced SIMD arithmetic always uses the Flush-to-zero setting, + // regardless of the value of the FZ bit. + union { + fpcr_bitfield field; +#if defined(__aarch64__) || defined(_M_ARM64) + uint64_t value; +#else + uint32_t value; +#endif + } r; + +#if defined(__aarch64__) || defined(_M_ARM64) + r.value = _sse2neon_get_fpcr(); +#else + __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */ +#endif + + r.field.bit24 = (flag & _MM_FLUSH_ZERO_MASK) == _MM_FLUSH_ZERO_ON; + +#if defined(__aarch64__) || defined(_M_ARM64) + _sse2neon_set_fpcr(r.value); +#else + __asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r)); /* write */ +#endif +} + +// Set packed single-precision (32-bit) floating-point elements in dst with the +// supplied values. 
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_ps +FORCE_INLINE __m128 _mm_set_ps(float w, float z, float y, float x) +{ + float ALIGN_STRUCT(16) data[4] = {x, y, z, w}; + return vreinterpretq_m128_f32(vld1q_f32(data)); +} + +// Broadcast single-precision (32-bit) floating-point value a to all elements of +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_ps1 +FORCE_INLINE __m128 _mm_set_ps1(float _w) +{ + return vreinterpretq_m128_f32(vdupq_n_f32(_w)); +} + +// Macro: Set the rounding mode bits of the MXCSR control and status register to +// the value in unsigned 32-bit integer a. The rounding mode may contain any of +// the following flags: _MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP, +// _MM_ROUND_TOWARD_ZERO +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_ROUNDING_MODE +FORCE_INLINE void _MM_SET_ROUNDING_MODE(int rounding) +{ + switch (rounding) { + case _MM_ROUND_NEAREST: + rounding = FE_TONEAREST; + break; + case _MM_ROUND_DOWN: + rounding = FE_DOWNWARD; + break; + case _MM_ROUND_UP: + rounding = FE_UPWARD; + break; + case _MM_ROUND_TOWARD_ZERO: + rounding = FE_TOWARDZERO; + break; + default: + // rounding must be _MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP, + // _MM_ROUND_TOWARD_ZERO. all the other invalid values we treat them as + // FE_TOWARDZERO (truncate). + rounding = FE_TOWARDZERO; + } + fesetround(rounding); +} + +// Copy single-precision (32-bit) floating-point element a to the lower element +// of dst, and zero the upper 3 elements. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_ss +FORCE_INLINE __m128 _mm_set_ss(float a) +{ + return vreinterpretq_m128_f32(vsetq_lane_f32(a, vdupq_n_f32(0), 0)); +} + +// Broadcast single-precision (32-bit) floating-point value a to all elements of +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_ps +FORCE_INLINE __m128 _mm_set1_ps(float _w) +{ + return vreinterpretq_m128_f32(vdupq_n_f32(_w)); +} + +// Set the MXCSR control and status register with the value in unsigned 32-bit +// integer a. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setcsr +// FIXME: _mm_setcsr() implementation supports changing the rounding mode only. +FORCE_INLINE void _mm_setcsr(unsigned int a) +{ + _MM_SET_ROUNDING_MODE(a); +} + +// Get the unsigned 32-bit value of the MXCSR control and status register. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_getcsr +// FIXME: _mm_getcsr() implementation supports reading the rounding mode only. +FORCE_INLINE unsigned int _mm_getcsr(void) +{ + return _MM_GET_ROUNDING_MODE(); +} + +// Set packed single-precision (32-bit) floating-point elements in dst with the +// supplied values in reverse order. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_ps +FORCE_INLINE __m128 _mm_setr_ps(float w, float z, float y, float x) +{ + float ALIGN_STRUCT(16) data[4] = {w, z, y, x}; + return vreinterpretq_m128_f32(vld1q_f32(data)); +} + +// Return vector of type __m128 with all elements set to zero. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_ps +FORCE_INLINE __m128 _mm_setzero_ps(void) +{ + return vreinterpretq_m128_f32(vdupq_n_f32(0)); +} + +// Shuffle 16-bit integers in a using the control in imm8, and store the results +// in dst. 
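_mm_set_ps takes its arguments from the highest lane down, while _mm_setr_ps takes them in memory order; the sketch below (hypothetical program, assuming sse2neon.h) shows the difference:

#include <stdio.h>
#include "sse2neon.h"

int main(void)
{
    float out[4];
    _mm_storeu_ps(out, _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f));
    printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]); /* 1 2 3 4 */
    _mm_storeu_ps(out, _mm_setr_ps(4.0f, 3.0f, 2.0f, 1.0f));
    printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]); /* 4 3 2 1 */
    return 0;
}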
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_pi16 +#ifdef _sse2neon_shuffle +#define _mm_shuffle_pi16(a, imm) \ + vreinterpret_m64_s16(vshuffle_s16( \ + vreinterpret_s16_m64(a), vreinterpret_s16_m64(a), (imm & 0x3), \ + ((imm >> 2) & 0x3), ((imm >> 4) & 0x3), ((imm >> 6) & 0x3))) +#else +#define _mm_shuffle_pi16(a, imm) \ + _sse2neon_define1( \ + __m64, a, int16x4_t ret; \ + ret = vmov_n_s16( \ + vget_lane_s16(vreinterpret_s16_m64(_a), (imm) & (0x3))); \ + ret = vset_lane_s16( \ + vget_lane_s16(vreinterpret_s16_m64(_a), ((imm) >> 2) & 0x3), ret, \ + 1); \ + ret = vset_lane_s16( \ + vget_lane_s16(vreinterpret_s16_m64(_a), ((imm) >> 4) & 0x3), ret, \ + 2); \ + ret = vset_lane_s16( \ + vget_lane_s16(vreinterpret_s16_m64(_a), ((imm) >> 6) & 0x3), ret, \ + 3); \ + _sse2neon_return(vreinterpret_m64_s16(ret));) +#endif + +// Perform a serializing operation on all store-to-memory instructions that were +// issued prior to this instruction. Guarantees that every store instruction +// that precedes, in program order, is globally visible before any store +// instruction which follows the fence in program order. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sfence +FORCE_INLINE void _mm_sfence(void) +{ + _sse2neon_smp_mb(); +} + +// Perform a serializing operation on all load-from-memory and store-to-memory +// instructions that were issued prior to this instruction. Guarantees that +// every memory access that precedes, in program order, the memory fence +// instruction is globally visible before any memory instruction which follows +// the fence in program order. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mfence +FORCE_INLINE void _mm_mfence(void) +{ + _sse2neon_smp_mb(); +} + +// Perform a serializing operation on all load-from-memory instructions that +// were issued prior to this instruction. Guarantees that every load instruction +// that precedes, in program order, is globally visible before any load +// instruction which follows the fence in program order. 
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_lfence +FORCE_INLINE void _mm_lfence(void) +{ + _sse2neon_smp_mb(); +} + +// FORCE_INLINE __m128 _mm_shuffle_ps(__m128 a, __m128 b, __constrange(0,255) +// int imm) +#ifdef _sse2neon_shuffle +#define _mm_shuffle_ps(a, b, imm) \ + __extension__({ \ + float32x4_t _input1 = vreinterpretq_f32_m128(a); \ + float32x4_t _input2 = vreinterpretq_f32_m128(b); \ + float32x4_t _shuf = \ + vshuffleq_s32(_input1, _input2, (imm) & (0x3), ((imm) >> 2) & 0x3, \ + (((imm) >> 4) & 0x3) + 4, (((imm) >> 6) & 0x3) + 4); \ + vreinterpretq_m128_f32(_shuf); \ + }) +#else // generic +#define _mm_shuffle_ps(a, b, imm) \ + _sse2neon_define2( \ + __m128, a, b, __m128 ret; switch (imm) { \ + case _MM_SHUFFLE(1, 0, 3, 2): \ + ret = _mm_shuffle_ps_1032(_a, _b); \ + break; \ + case _MM_SHUFFLE(2, 3, 0, 1): \ + ret = _mm_shuffle_ps_2301(_a, _b); \ + break; \ + case _MM_SHUFFLE(0, 3, 2, 1): \ + ret = _mm_shuffle_ps_0321(_a, _b); \ + break; \ + case _MM_SHUFFLE(2, 1, 0, 3): \ + ret = _mm_shuffle_ps_2103(_a, _b); \ + break; \ + case _MM_SHUFFLE(1, 0, 1, 0): \ + ret = _mm_movelh_ps(_a, _b); \ + break; \ + case _MM_SHUFFLE(1, 0, 0, 1): \ + ret = _mm_shuffle_ps_1001(_a, _b); \ + break; \ + case _MM_SHUFFLE(0, 1, 0, 1): \ + ret = _mm_shuffle_ps_0101(_a, _b); \ + break; \ + case _MM_SHUFFLE(3, 2, 1, 0): \ + ret = _mm_shuffle_ps_3210(_a, _b); \ + break; \ + case _MM_SHUFFLE(0, 0, 1, 1): \ + ret = _mm_shuffle_ps_0011(_a, _b); \ + break; \ + case _MM_SHUFFLE(0, 0, 2, 2): \ + ret = _mm_shuffle_ps_0022(_a, _b); \ + break; \ + case _MM_SHUFFLE(2, 2, 0, 0): \ + ret = _mm_shuffle_ps_2200(_a, _b); \ + break; \ + case _MM_SHUFFLE(3, 2, 0, 2): \ + ret = _mm_shuffle_ps_3202(_a, _b); \ + break; \ + case _MM_SHUFFLE(3, 2, 3, 2): \ + ret = _mm_movehl_ps(_b, _a); \ + break; \ + case _MM_SHUFFLE(1, 1, 3, 3): \ + ret = _mm_shuffle_ps_1133(_a, _b); \ + break; \ + case _MM_SHUFFLE(2, 0, 1, 0): \ + ret = _mm_shuffle_ps_2010(_a, _b); \ + break; \ + case _MM_SHUFFLE(2, 0, 0, 1): \ + ret = _mm_shuffle_ps_2001(_a, _b); \ + break; \ + case _MM_SHUFFLE(2, 0, 3, 2): \ + ret = _mm_shuffle_ps_2032(_a, _b); \ + break; \ + default: \ + ret = _mm_shuffle_ps_default(_a, _b, (imm)); \ + break; \ + } _sse2neon_return(ret);) +#endif + +// Compute the square root of packed single-precision (32-bit) floating-point +// elements in a, and store the results in dst. +// Due to ARMv7-A NEON's lack of a precise square root intrinsic, we implement +// square root by multiplying input in with its reciprocal square root before +// using the Newton-Raphson method to approximate the results. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_ps +FORCE_INLINE __m128 _mm_sqrt_ps(__m128 in) +{ +#if (defined(__aarch64__) || defined(_M_ARM64)) && !SSE2NEON_PRECISE_SQRT + return vreinterpretq_m128_f32(vsqrtq_f32(vreinterpretq_f32_m128(in))); +#else + float32x4_t recip = vrsqrteq_f32(vreinterpretq_f32_m128(in)); + + // Test for vrsqrteq_f32(0) -> positive infinity case. + // Change to zero, so that s * 1/sqrt(s) result is zero too. 
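Whichever _mm_shuffle_ps path is taken, the selection follows the usual _MM_SHUFFLE encoding: the two low fields pick lanes of a, the two high fields pick lanes of b. For example (illustrative program assuming sse2neon.h):

#include <stdio.h>
#include "sse2neon.h"

int main(void)
{
    float out[4];
    __m128 a = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
    __m128 b = _mm_setr_ps(5.0f, 6.0f, 7.0f, 8.0f);
    /* dst = { a[0], a[2], b[1], b[3] } */
    _mm_storeu_ps(out, _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 1, 2, 0)));
    printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]); /* 1 3 6 8 */
    return 0;
}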
+ const uint32x4_t pos_inf = vdupq_n_u32(0x7F800000); + const uint32x4_t div_by_zero = + vceqq_u32(pos_inf, vreinterpretq_u32_f32(recip)); + recip = vreinterpretq_f32_u32( + vandq_u32(vmvnq_u32(div_by_zero), vreinterpretq_u32_f32(recip))); + + recip = vmulq_f32( + vrsqrtsq_f32(vmulq_f32(recip, recip), vreinterpretq_f32_m128(in)), + recip); + // Additional Netwon-Raphson iteration for accuracy + recip = vmulq_f32( + vrsqrtsq_f32(vmulq_f32(recip, recip), vreinterpretq_f32_m128(in)), + recip); + + // sqrt(s) = s * 1/sqrt(s) + return vreinterpretq_m128_f32(vmulq_f32(vreinterpretq_f32_m128(in), recip)); +#endif +} + +// Compute the square root of the lower single-precision (32-bit) floating-point +// element in a, store the result in the lower element of dst, and copy the +// upper 3 packed elements from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_ss +FORCE_INLINE __m128 _mm_sqrt_ss(__m128 in) +{ + float32_t value = + vgetq_lane_f32(vreinterpretq_f32_m128(_mm_sqrt_ps(in)), 0); + return vreinterpretq_m128_f32( + vsetq_lane_f32(value, vreinterpretq_f32_m128(in), 0)); +} + +// Store 128-bits (composed of 4 packed single-precision (32-bit) floating-point +// elements) from a into memory. mem_addr must be aligned on a 16-byte boundary +// or a general-protection exception may be generated. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_ps +FORCE_INLINE void _mm_store_ps(float *p, __m128 a) +{ + vst1q_f32(p, vreinterpretq_f32_m128(a)); +} + +// Store the lower single-precision (32-bit) floating-point element from a into +// 4 contiguous elements in memory. mem_addr must be aligned on a 16-byte +// boundary or a general-protection exception may be generated. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_ps1 +FORCE_INLINE void _mm_store_ps1(float *p, __m128 a) +{ + float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0); + vst1q_f32(p, vdupq_n_f32(a0)); +} + +// Store the lower single-precision (32-bit) floating-point element from a into +// memory. mem_addr does not need to be aligned on any particular boundary. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_ss +FORCE_INLINE void _mm_store_ss(float *p, __m128 a) +{ + vst1q_lane_f32(p, vreinterpretq_f32_m128(a), 0); +} + +// Store the lower single-precision (32-bit) floating-point element from a into +// 4 contiguous elements in memory. mem_addr must be aligned on a 16-byte +// boundary or a general-protection exception may be generated. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store1_ps +#define _mm_store1_ps _mm_store_ps1 + +// Store the upper 2 single-precision (32-bit) floating-point elements from a +// into memory. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeh_pi +FORCE_INLINE void _mm_storeh_pi(__m64 *p, __m128 a) +{ + *p = vreinterpret_m64_f32(vget_high_f32(a)); +} + +// Store the lower 2 single-precision (32-bit) floating-point elements from a +// into memory. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storel_pi +FORCE_INLINE void _mm_storel_pi(__m64 *p, __m128 a) +{ + *p = vreinterpret_m64_f32(vget_low_f32(a)); +} + +// Store 4 single-precision (32-bit) floating-point elements from a into memory +// in reverse order. 
mem_addr must be aligned on a 16-byte boundary or a +// general-protection exception may be generated. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storer_ps +FORCE_INLINE void _mm_storer_ps(float *p, __m128 a) +{ + float32x4_t tmp = vrev64q_f32(vreinterpretq_f32_m128(a)); + float32x4_t rev = vextq_f32(tmp, tmp, 2); + vst1q_f32(p, rev); +} + +// Store 128-bits (composed of 4 packed single-precision (32-bit) floating-point +// elements) from a into memory. mem_addr does not need to be aligned on any +// particular boundary. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_ps +FORCE_INLINE void _mm_storeu_ps(float *p, __m128 a) +{ + vst1q_f32(p, vreinterpretq_f32_m128(a)); +} + +// Stores 16-bits of integer data a at the address p. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si16 +FORCE_INLINE void _mm_storeu_si16(void *p, __m128i a) +{ + vst1q_lane_s16((int16_t *) p, vreinterpretq_s16_m128i(a), 0); +} + +// Stores 64-bits of integer data a at the address p. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si64 +FORCE_INLINE void _mm_storeu_si64(void *p, __m128i a) +{ + vst1q_lane_s64((int64_t *) p, vreinterpretq_s64_m128i(a), 0); +} + +// Store 64-bits of integer data from a into memory using a non-temporal memory +// hint. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_pi +FORCE_INLINE void _mm_stream_pi(__m64 *p, __m64 a) +{ + vst1_s64((int64_t *) p, vreinterpret_s64_m64(a)); +} + +// Store 128-bits (composed of 4 packed single-precision (32-bit) floating- +// point elements) from a into memory using a non-temporal memory hint. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_ps +FORCE_INLINE void _mm_stream_ps(float *p, __m128 a) +{ +#if __has_builtin(__builtin_nontemporal_store) + __builtin_nontemporal_store(a, (float32x4_t *) p); +#else + vst1q_f32(p, vreinterpretq_f32_m128(a)); +#endif +} + +// Subtract packed single-precision (32-bit) floating-point elements in b from +// packed single-precision (32-bit) floating-point elements in a, and store the +// results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_ps +FORCE_INLINE __m128 _mm_sub_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_f32( + vsubq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +} + +// Subtract the lower single-precision (32-bit) floating-point element in b from +// the lower single-precision (32-bit) floating-point element in a, store the +// result in the lower element of dst, and copy the upper 3 packed elements from +// a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_ss +FORCE_INLINE __m128 _mm_sub_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_sub_ps(a, b)); +} + +// Macro: Transpose the 4x4 matrix formed by the 4 rows of single-precision +// (32-bit) floating-point elements in row0, row1, row2, and row3, and store the +// transposed matrix in these vectors (row0 now contains column 0, etc.). 
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=MM_TRANSPOSE4_PS +#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \ + do { \ + float32x4x2_t ROW01 = vtrnq_f32(row0, row1); \ + float32x4x2_t ROW23 = vtrnq_f32(row2, row3); \ + row0 = vcombine_f32(vget_low_f32(ROW01.val[0]), \ + vget_low_f32(ROW23.val[0])); \ + row1 = vcombine_f32(vget_low_f32(ROW01.val[1]), \ + vget_low_f32(ROW23.val[1])); \ + row2 = vcombine_f32(vget_high_f32(ROW01.val[0]), \ + vget_high_f32(ROW23.val[0])); \ + row3 = vcombine_f32(vget_high_f32(ROW01.val[1]), \ + vget_high_f32(ROW23.val[1])); \ + } while (0) + +// according to the documentation, these intrinsics behave the same as the +// non-'u' versions. We'll just alias them here. +#define _mm_ucomieq_ss _mm_comieq_ss +#define _mm_ucomige_ss _mm_comige_ss +#define _mm_ucomigt_ss _mm_comigt_ss +#define _mm_ucomile_ss _mm_comile_ss +#define _mm_ucomilt_ss _mm_comilt_ss +#define _mm_ucomineq_ss _mm_comineq_ss + +// Return vector of type __m128i with undefined elements. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_undefined_si128 +FORCE_INLINE __m128i _mm_undefined_si128(void) +{ +#if defined(__GNUC__) || defined(__clang__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wuninitialized" +#endif + __m128i a; +#if defined(_MSC_VER) + a = _mm_setzero_si128(); +#endif + return a; +#if defined(__GNUC__) || defined(__clang__) +#pragma GCC diagnostic pop +#endif +} + +// Return vector of type __m128 with undefined elements. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_undefined_ps +FORCE_INLINE __m128 _mm_undefined_ps(void) +{ +#if defined(__GNUC__) || defined(__clang__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wuninitialized" +#endif + __m128 a; +#if defined(_MSC_VER) + a = _mm_setzero_ps(); +#endif + return a; +#if defined(__GNUC__) || defined(__clang__) +#pragma GCC diagnostic pop +#endif +} + +// Unpack and interleave single-precision (32-bit) floating-point elements from +// the high half a and b, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_ps +FORCE_INLINE __m128 _mm_unpackhi_ps(__m128 a, __m128 b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128_f32( + vzip2q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +#else + float32x2_t a1 = vget_high_f32(vreinterpretq_f32_m128(a)); + float32x2_t b1 = vget_high_f32(vreinterpretq_f32_m128(b)); + float32x2x2_t result = vzip_f32(a1, b1); + return vreinterpretq_m128_f32(vcombine_f32(result.val[0], result.val[1])); +#endif +} + +// Unpack and interleave single-precision (32-bit) floating-point elements from +// the low half of a and b, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_ps +FORCE_INLINE __m128 _mm_unpacklo_ps(__m128 a, __m128 b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128_f32( + vzip1q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +#else + float32x2_t a1 = vget_low_f32(vreinterpretq_f32_m128(a)); + float32x2_t b1 = vget_low_f32(vreinterpretq_f32_m128(b)); + float32x2x2_t result = vzip_f32(a1, b1); + return vreinterpretq_m128_f32(vcombine_f32(result.val[0], result.val[1])); +#endif +} + +// Compute the bitwise XOR of packed single-precision (32-bit) floating-point +// elements in a and b, and store the results in dst. 
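_MM_TRANSPOSE4_PS rewrites its four row arguments in place, so after the macro each variable holds one column of the original matrix. A minimal usage sketch (assuming sse2neon.h):

#include <stdio.h>
#include "sse2neon.h"

int main(void)
{
    float out[4];
    __m128 r0 = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
    __m128 r1 = _mm_setr_ps(5.0f, 6.0f, 7.0f, 8.0f);
    __m128 r2 = _mm_setr_ps(9.0f, 10.0f, 11.0f, 12.0f);
    __m128 r3 = _mm_setr_ps(13.0f, 14.0f, 15.0f, 16.0f);
    _MM_TRANSPOSE4_PS(r0, r1, r2, r3); /* r0 now holds the first column */
    _mm_storeu_ps(out, r0);
    printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]); /* 1 5 9 13 */
    return 0;
}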
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_xor_ps +FORCE_INLINE __m128 _mm_xor_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_s32( + veorq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b))); +} + +/* SSE2 */ + +// Add packed 16-bit integers in a and b, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi16 +FORCE_INLINE __m128i _mm_add_epi16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s16( + vaddq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); +} + +// Add packed 32-bit integers in a and b, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi32 +FORCE_INLINE __m128i _mm_add_epi32(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s32( + vaddq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +} + +// Add packed 64-bit integers in a and b, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi64 +FORCE_INLINE __m128i _mm_add_epi64(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s64( + vaddq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b))); +} + +// Add packed 8-bit integers in a and b, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi8 +FORCE_INLINE __m128i _mm_add_epi8(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s8( + vaddq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); +} + +// Add packed double-precision (64-bit) floating-point elements in a and b, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_pd +FORCE_INLINE __m128d _mm_add_pd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128d_f64( + vaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); +#else + double a0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + double a1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + double b0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double b1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1)); + double c[2]; + c[0] = a0 + b0; + c[1] = a1 + b1; + return vld1q_f32((float32_t *) c); +#endif +} + +// Add the lower double-precision (64-bit) floating-point element in a and b, +// store the result in the lower element of dst, and copy the upper element from +// a to the upper element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_sd +FORCE_INLINE __m128d _mm_add_sd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return _mm_move_sd(a, _mm_add_pd(a, b)); +#else + double a0, a1, b0; + a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + a1 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + b0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double c[2]; + c[0] = a0 + b0; + c[1] = a1; + return vld1q_f32((float32_t *) c); +#endif +} + +// Add 64-bit integers a and b, and store the result in dst. 
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_si64 +FORCE_INLINE __m64 _mm_add_si64(__m64 a, __m64 b) +{ + return vreinterpret_m64_s64( + vadd_s64(vreinterpret_s64_m64(a), vreinterpret_s64_m64(b))); +} + +// Add packed signed 16-bit integers in a and b using saturation, and store the +// results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epi16 +FORCE_INLINE __m128i _mm_adds_epi16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s16( + vqaddq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); +} + +// Add packed signed 8-bit integers in a and b using saturation, and store the +// results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epi8 +FORCE_INLINE __m128i _mm_adds_epi8(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s8( + vqaddq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); +} + +// Add packed unsigned 16-bit integers in a and b using saturation, and store +// the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epu16 +FORCE_INLINE __m128i _mm_adds_epu16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u16( + vqaddq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b))); +} + +// Add packed unsigned 8-bit integers in a and b using saturation, and store the +// results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epu8 +FORCE_INLINE __m128i _mm_adds_epu8(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u8( + vqaddq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b))); +} + +// Compute the bitwise AND of packed double-precision (64-bit) floating-point +// elements in a and b, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_pd +FORCE_INLINE __m128d _mm_and_pd(__m128d a, __m128d b) +{ + return vreinterpretq_m128d_s64( + vandq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b))); +} + +// Compute the bitwise AND of 128 bits (representing integer data) in a and b, +// and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_si128 +FORCE_INLINE __m128i _mm_and_si128(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s32( + vandq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +} + +// Compute the bitwise NOT of packed double-precision (64-bit) floating-point +// elements in a and then AND with b, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_andnot_pd +FORCE_INLINE __m128d _mm_andnot_pd(__m128d a, __m128d b) +{ + // *NOTE* argument swap + return vreinterpretq_m128d_s64( + vbicq_s64(vreinterpretq_s64_m128d(b), vreinterpretq_s64_m128d(a))); +} + +// Compute the bitwise NOT of 128 bits (representing integer data) in a and then +// AND with b, and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_andnot_si128 +FORCE_INLINE __m128i _mm_andnot_si128(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s32( + vbicq_s32(vreinterpretq_s32_m128i(b), + vreinterpretq_s32_m128i(a))); // *NOTE* argument swap +} + +// Average packed unsigned 16-bit integers in a and b, and store the results in +// dst. 
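The argument swap noted in _mm_andnot_pd and _mm_andnot_si128 reflects the x86 semantics dst = (~a) & b, which NEON's vbicq expresses as bic(b, a). A small check (hypothetical program assuming sse2neon.h):

#include <stdio.h>
#include "sse2neon.h"

int main(void)
{
    unsigned int out[4];
    __m128i a = _mm_set1_epi32(0x0000FFFF);
    __m128i b = _mm_set1_epi32(0x00FF00FF);
    /* andnot computes (~a) & b, so bits already set in a are cleared from b. */
    _mm_storeu_si128((__m128i *) out, _mm_andnot_si128(a, b));
    printf("0x%08x\n", out[0]); /* 0x00ff0000 */
    return 0;
}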
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_epu16 +FORCE_INLINE __m128i _mm_avg_epu16(__m128i a, __m128i b) +{ + return (__m128i) vrhaddq_u16(vreinterpretq_u16_m128i(a), + vreinterpretq_u16_m128i(b)); +} + +// Average packed unsigned 8-bit integers in a and b, and store the results in +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_epu8 +FORCE_INLINE __m128i _mm_avg_epu8(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u8( + vrhaddq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b))); +} + +// Shift a left by imm8 bytes while shifting in zeros, and store the results in +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_bslli_si128 +#define _mm_bslli_si128(a, imm) _mm_slli_si128(a, imm) + +// Shift a right by imm8 bytes while shifting in zeros, and store the results in +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_bsrli_si128 +#define _mm_bsrli_si128(a, imm) _mm_srli_si128(a, imm) + +// Cast vector of type __m128d to type __m128. This intrinsic is only used for +// compilation and does not generate any instructions, thus it has zero latency. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castpd_ps +FORCE_INLINE __m128 _mm_castpd_ps(__m128d a) +{ + return vreinterpretq_m128_s64(vreinterpretq_s64_m128d(a)); +} + +// Cast vector of type __m128d to type __m128i. This intrinsic is only used for +// compilation and does not generate any instructions, thus it has zero latency. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castpd_si128 +FORCE_INLINE __m128i _mm_castpd_si128(__m128d a) +{ + return vreinterpretq_m128i_s64(vreinterpretq_s64_m128d(a)); +} + +// Cast vector of type __m128 to type __m128d. This intrinsic is only used for +// compilation and does not generate any instructions, thus it has zero latency. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castps_pd +FORCE_INLINE __m128d _mm_castps_pd(__m128 a) +{ + return vreinterpretq_m128d_s32(vreinterpretq_s32_m128(a)); +} + +// Cast vector of type __m128 to type __m128i. This intrinsic is only used for +// compilation and does not generate any instructions, thus it has zero latency. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castps_si128 +FORCE_INLINE __m128i _mm_castps_si128(__m128 a) +{ + return vreinterpretq_m128i_s32(vreinterpretq_s32_m128(a)); +} + +// Cast vector of type __m128i to type __m128d. This intrinsic is only used for +// compilation and does not generate any instructions, thus it has zero latency. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castsi128_pd +FORCE_INLINE __m128d _mm_castsi128_pd(__m128i a) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128d_f64(vreinterpretq_f64_m128i(a)); +#else + return vreinterpretq_m128d_f32(vreinterpretq_f32_m128i(a)); +#endif +} + +// Cast vector of type __m128i to type __m128. This intrinsic is only used for +// compilation and does not generate any instructions, thus it has zero latency. 
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castsi128_ps
+FORCE_INLINE __m128 _mm_castsi128_ps(__m128i a)
+{
+    return vreinterpretq_m128_s32(vreinterpretq_s32_m128i(a));
+}
+
+// Invalidate and flush the cache line that contains p from all levels of the
+// cache hierarchy.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_clflush
+#if defined(__APPLE__)
+#include <libkern/OSCacheControl.h>
+#endif
+FORCE_INLINE void _mm_clflush(void const *p)
+{
+    (void) p;
+
+    /* sys_icache_invalidate is supported since macOS 10.5.
+     * However, it does not work on non-jailbroken iOS devices, although the
+     * compilation is successful.
+     */
+#if defined(__APPLE__)
+    sys_icache_invalidate((void *) (uintptr_t) p, SSE2NEON_CACHELINE_SIZE);
+#elif defined(__GNUC__) || defined(__clang__)
+    uintptr_t ptr = (uintptr_t) p;
+    __builtin___clear_cache((char *) ptr,
+                            (char *) ptr + SSE2NEON_CACHELINE_SIZE);
+#elif (_MSC_VER) && SSE2NEON_INCLUDE_WINDOWS_H
+    FlushInstructionCache(GetCurrentProcess(), p, SSE2NEON_CACHELINE_SIZE);
+#endif
+}
+
+// Compare packed 16-bit integers in a and b for equality, and store the results
+// in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi16
+FORCE_INLINE __m128i _mm_cmpeq_epi16(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u16(
+        vceqq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
+}
+
+// Compare packed 32-bit integers in a and b for equality, and store the results
+// in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi32
+FORCE_INLINE __m128i _mm_cmpeq_epi32(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u32(
+        vceqq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
+}
+
+// Compare packed 8-bit integers in a and b for equality, and store the results
+// in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi8
+FORCE_INLINE __m128i _mm_cmpeq_epi8(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u8(
+        vceqq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
+}
+
+// Compare packed double-precision (64-bit) floating-point elements in a and b
+// for equality, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_pd
+FORCE_INLINE __m128d _mm_cmpeq_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__) || defined(_M_ARM64)
+    return vreinterpretq_m128d_u64(
+        vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
+#else
+    // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi)
+    uint32x4_t cmp =
+        vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(b));
+    uint32x4_t swapped = vrev64q_u32(cmp);
+    return vreinterpretq_m128d_u32(vandq_u32(cmp, swapped));
+#endif
+}
+
+// Compare the lower double-precision (64-bit) floating-point elements in a and
+// b for equality, store the result in the lower element of dst, and copy the
+// upper element from a to the upper element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_sd
+FORCE_INLINE __m128d _mm_cmpeq_sd(__m128d a, __m128d b)
+{
+    return _mm_move_sd(a, _mm_cmpeq_pd(a, b));
+}
+
+// Compare packed double-precision (64-bit) floating-point elements in a and b
+// for greater-than-or-equal, and store the results in dst.
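+
+// Illustrative sketch for this review (not part of the upstream sse2neon
+// header; the example name is invented): the packed compare intrinsics above
+// return per-lane all-ones/all-zeros masks rather than 0/1 values, which is
+// what makes them combinable with _mm_and_si128 / _mm_andnot_si128 for
+// branchless selection.
+FORCE_INLINE int sse2neon_example_cmpeq_yields_lane_mask(void)
+{
+    __m128i a = vreinterpretq_m128i_s32(vdupq_n_s32(7));
+    __m128i b = vreinterpretq_m128i_s32(vdupq_n_s32(7));
+    __m128i mask = _mm_cmpeq_epi32(a, b); /* equal lanes become 0xFFFFFFFF */
+    return vgetq_lane_s32(vreinterpretq_s32_m128i(mask), 0) == -1;
+}
+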
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_pd +FORCE_INLINE __m128d _mm_cmpge_pd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128d_u64( + vcgeq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); +#else + double a0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + double a1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + double b0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double b1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1)); + uint64_t d[2]; + d[0] = a0 >= b0 ? ~UINT64_C(0) : UINT64_C(0); + d[1] = a1 >= b1 ? ~UINT64_C(0) : UINT64_C(0); + + return vreinterpretq_m128d_u64(vld1q_u64(d)); +#endif +} + +// Compare the lower double-precision (64-bit) floating-point elements in a and +// b for greater-than-or-equal, store the result in the lower element of dst, +// and copy the upper element from a to the upper element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_sd +FORCE_INLINE __m128d _mm_cmpge_sd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return _mm_move_sd(a, _mm_cmpge_pd(a, b)); +#else + // expand "_mm_cmpge_pd()" to reduce unnecessary operations + double a0, b0; + a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + uint64_t a1 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1); + b0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + uint64_t d[2]; + d[0] = a0 >= b0 ? ~UINT64_C(0) : UINT64_C(0); + d[1] = a1; + + return vreinterpretq_m128d_u64(vld1q_u64(d)); +#endif +} + +// Compare packed signed 16-bit integers in a and b for greater-than, and store +// the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi16 +FORCE_INLINE __m128i _mm_cmpgt_epi16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u16( + vcgtq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); +} + +// Compare packed signed 32-bit integers in a and b for greater-than, and store +// the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi32 +FORCE_INLINE __m128i _mm_cmpgt_epi32(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u32( + vcgtq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +} + +// Compare packed signed 8-bit integers in a and b for greater-than, and store +// the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi8 +FORCE_INLINE __m128i _mm_cmpgt_epi8(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u8( + vcgtq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); +} + +// Compare packed double-precision (64-bit) floating-point elements in a and b +// for greater-than, and store the results in dst. 
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_pd
+FORCE_INLINE __m128d _mm_cmpgt_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__) || defined(_M_ARM64)
+    return vreinterpretq_m128d_u64(
+        vcgtq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
+#else
+    double a0 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    double a1 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
+    double b0 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
+    double b1 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1));
+    uint64_t d[2];
+    d[0] = a0 > b0 ? ~UINT64_C(0) : UINT64_C(0);
+    d[1] = a1 > b1 ? ~UINT64_C(0) : UINT64_C(0);
+
+    return vreinterpretq_m128d_u64(vld1q_u64(d));
+#endif
+}
+
+// Compare the lower double-precision (64-bit) floating-point elements in a and
+// b for greater-than, store the result in the lower element of dst, and copy
+// the upper element from a to the upper element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_sd
+FORCE_INLINE __m128d _mm_cmpgt_sd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__) || defined(_M_ARM64)
+    return _mm_move_sd(a, _mm_cmpgt_pd(a, b));
+#else
+    // expand "_mm_cmpgt_pd()" to reduce unnecessary operations
+    double a0, b0;
+    a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    uint64_t a1 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
+    b0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
+    uint64_t d[2];
+    d[0] = a0 > b0 ? ~UINT64_C(0) : UINT64_C(0);
+    d[1] = a1;
+
+    return vreinterpretq_m128d_u64(vld1q_u64(d));
+#endif
+}
+
+// Compare packed double-precision (64-bit) floating-point elements in a and b
+// for less-than-or-equal, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_pd
+FORCE_INLINE __m128d _mm_cmple_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__) || defined(_M_ARM64)
+    return vreinterpretq_m128d_u64(
+        vcleq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
+#else
+    double a0 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    double a1 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
+    double b0 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
+    double b1 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1));
+    uint64_t d[2];
+    d[0] = a0 <= b0 ? ~UINT64_C(0) : UINT64_C(0);
+    d[1] = a1 <= b1 ? ~UINT64_C(0) : UINT64_C(0);
+
+    return vreinterpretq_m128d_u64(vld1q_u64(d));
+#endif
+}
+
+// Compare the lower double-precision (64-bit) floating-point elements in a and
+// b for less-than-or-equal, store the result in the lower element of dst, and
+// copy the upper element from a to the upper element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_sd
+FORCE_INLINE __m128d _mm_cmple_sd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__) || defined(_M_ARM64)
+    return _mm_move_sd(a, _mm_cmple_pd(a, b));
+#else
+    // expand "_mm_cmple_pd()" to reduce unnecessary operations
+    double a0, b0;
+    a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    uint64_t a1 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
+    b0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
+    uint64_t d[2];
+    d[0] = a0 <= b0 ?
~UINT64_C(0) : UINT64_C(0); + d[1] = a1; + + return vreinterpretq_m128d_u64(vld1q_u64(d)); +#endif +} + +// Compare packed signed 16-bit integers in a and b for less-than, and store the +// results in dst. Note: This intrinsic emits the pcmpgtw instruction with the +// order of the operands switched. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi16 +FORCE_INLINE __m128i _mm_cmplt_epi16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u16( + vcltq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); +} + +// Compare packed signed 32-bit integers in a and b for less-than, and store the +// results in dst. Note: This intrinsic emits the pcmpgtd instruction with the +// order of the operands switched. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi32 +FORCE_INLINE __m128i _mm_cmplt_epi32(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u32( + vcltq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +} + +// Compare packed signed 8-bit integers in a and b for less-than, and store the +// results in dst. Note: This intrinsic emits the pcmpgtb instruction with the +// order of the operands switched. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi8 +FORCE_INLINE __m128i _mm_cmplt_epi8(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u8( + vcltq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); +} + +// Compare packed double-precision (64-bit) floating-point elements in a and b +// for less-than, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_pd +FORCE_INLINE __m128d _mm_cmplt_pd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128d_u64( + vcltq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); +#else + double a0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + double a1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + double b0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double b1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1)); + uint64_t d[2]; + d[0] = a0 < b0 ? ~UINT64_C(0) : UINT64_C(0); + d[1] = a1 < b1 ? ~UINT64_C(0) : UINT64_C(0); + + return vreinterpretq_m128d_u64(vld1q_u64(d)); +#endif +} + +// Compare the lower double-precision (64-bit) floating-point elements in a and +// b for less-than, store the result in the lower element of dst, and copy the +// upper element from a to the upper element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_sd +FORCE_INLINE __m128d _mm_cmplt_sd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return _mm_move_sd(a, _mm_cmplt_pd(a, b)); +#else + double a0, b0; + a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + uint64_t a1 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1); + b0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + uint64_t d[2]; + d[0] = a0 < b0 ? ~UINT64_C(0) : UINT64_C(0); + d[1] = a1; + + return vreinterpretq_m128d_u64(vld1q_u64(d)); +#endif +} + +// Compare packed double-precision (64-bit) floating-point elements in a and b +// for not-equal, and store the results in dst. 
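+
+// Illustrative sketch for this review (not part of the upstream sse2neon
+// header; the example name is invented): as the notes above say, the cmplt
+// forms are the cmpgt forms with the operands switched, so both expressions
+// below produce the same lane mask.
+FORCE_INLINE int sse2neon_example_cmplt_is_swapped_cmpgt(void)
+{
+    __m128i one = vreinterpretq_m128i_s32(vdupq_n_s32(1));
+    __m128i two = vreinterpretq_m128i_s32(vdupq_n_s32(2));
+    __m128i lt = _mm_cmplt_epi32(one, two);
+    __m128i gt = _mm_cmpgt_epi32(two, one);
+    return vgetq_lane_s32(vreinterpretq_s32_m128i(lt), 0) ==
+           vgetq_lane_s32(vreinterpretq_s32_m128i(gt), 0);
+}
+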
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_pd
+FORCE_INLINE __m128d _mm_cmpneq_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__) || defined(_M_ARM64)
+    return vreinterpretq_m128d_s32(vmvnq_s32(vreinterpretq_s32_u64(
+        vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)))));
+#else
+    // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi)
+    uint32x4_t cmp =
+        vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(b));
+    uint32x4_t swapped = vrev64q_u32(cmp);
+    return vreinterpretq_m128d_u32(vmvnq_u32(vandq_u32(cmp, swapped)));
+#endif
+}
+
+// Compare the lower double-precision (64-bit) floating-point elements in a and
+// b for not-equal, store the result in the lower element of dst, and copy the
+// upper element from a to the upper element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_sd
+FORCE_INLINE __m128d _mm_cmpneq_sd(__m128d a, __m128d b)
+{
+    return _mm_move_sd(a, _mm_cmpneq_pd(a, b));
+}
+
+// Compare packed double-precision (64-bit) floating-point elements in a and b
+// for not-greater-than-or-equal, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_pd
+FORCE_INLINE __m128d _mm_cmpnge_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__) || defined(_M_ARM64)
+    return vreinterpretq_m128d_u64(veorq_u64(
+        vcgeq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
+        vdupq_n_u64(UINT64_MAX)));
+#else
+    double a0 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    double a1 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
+    double b0 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
+    double b1 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1));
+    uint64_t d[2];
+    d[0] = !(a0 >= b0) ? ~UINT64_C(0) : UINT64_C(0);
+    d[1] = !(a1 >= b1) ? ~UINT64_C(0) : UINT64_C(0);
+
+    return vreinterpretq_m128d_u64(vld1q_u64(d));
+#endif
+}
+
+// Compare the lower double-precision (64-bit) floating-point elements in a and
+// b for not-greater-than-or-equal, store the result in the lower element of
+// dst, and copy the upper element from a to the upper element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_sd
+FORCE_INLINE __m128d _mm_cmpnge_sd(__m128d a, __m128d b)
+{
+    return _mm_move_sd(a, _mm_cmpnge_pd(a, b));
+}
+
+// Compare packed double-precision (64-bit) floating-point elements in a and b
+// for not-greater-than, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_pd
+FORCE_INLINE __m128d _mm_cmpngt_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__) || defined(_M_ARM64)
+    return vreinterpretq_m128d_u64(veorq_u64(
+        vcgtq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
+        vdupq_n_u64(UINT64_MAX)));
+#else
+    double a0 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    double a1 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
+    double b0 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
+    double b1 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1));
+    uint64_t d[2];
+    d[0] = !(a0 > b0) ? ~UINT64_C(0) : UINT64_C(0);
+    d[1] = !(a1 > b1) ?
~UINT64_C(0) : UINT64_C(0); + + return vreinterpretq_m128d_u64(vld1q_u64(d)); +#endif +} + +// Compare the lower double-precision (64-bit) floating-point elements in a and +// b for not-greater-than, store the result in the lower element of dst, and +// copy the upper element from a to the upper element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_sd +FORCE_INLINE __m128d _mm_cmpngt_sd(__m128d a, __m128d b) +{ + return _mm_move_sd(a, _mm_cmpngt_pd(a, b)); +} + +// Compare packed double-precision (64-bit) floating-point elements in a and b +// for not-less-than-or-equal, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_pd +FORCE_INLINE __m128d _mm_cmpnle_pd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128d_u64(veorq_u64( + vcleq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)), + vdupq_n_u64(UINT64_MAX))); +#else + double a0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + double a1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + double b0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double b1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1)); + uint64_t d[2]; + d[0] = !(a0 <= b0) ? ~UINT64_C(0) : UINT64_C(0); + d[1] = !(a1 <= b1) ? ~UINT64_C(0) : UINT64_C(0); + + return vreinterpretq_m128d_u64(vld1q_u64(d)); +#endif +} + +// Compare the lower double-precision (64-bit) floating-point elements in a and +// b for not-less-than-or-equal, store the result in the lower element of dst, +// and copy the upper element from a to the upper element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_sd +FORCE_INLINE __m128d _mm_cmpnle_sd(__m128d a, __m128d b) +{ + return _mm_move_sd(a, _mm_cmpnle_pd(a, b)); +} + +// Compare packed double-precision (64-bit) floating-point elements in a and b +// for not-less-than, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_pd +FORCE_INLINE __m128d _mm_cmpnlt_pd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128d_u64(veorq_u64( + vcltq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)), + vdupq_n_u64(UINT64_MAX))); +#else + double a0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + double a1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + double b0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double b1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1)); + uint64_t d[2]; + d[0] = !(a0 < b0) ? ~UINT64_C(0) : UINT64_C(0); + d[1] = !(a1 < b1) ? ~UINT64_C(0) : UINT64_C(0); + + return vreinterpretq_m128d_u64(vld1q_u64(d)); +#endif +} + +// Compare the lower double-precision (64-bit) floating-point elements in a and +// b for not-less-than, store the result in the lower element of dst, and copy +// the upper element from a to the upper element of dst. 
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_sd +FORCE_INLINE __m128d _mm_cmpnlt_sd(__m128d a, __m128d b) +{ + return _mm_move_sd(a, _mm_cmpnlt_pd(a, b)); +} + +// Compare packed double-precision (64-bit) floating-point elements in a and b +// to see if neither is NaN, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_pd +FORCE_INLINE __m128d _mm_cmpord_pd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + // Excluding NaNs, any two floating point numbers can be compared. + uint64x2_t not_nan_a = + vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(a)); + uint64x2_t not_nan_b = + vceqq_f64(vreinterpretq_f64_m128d(b), vreinterpretq_f64_m128d(b)); + return vreinterpretq_m128d_u64(vandq_u64(not_nan_a, not_nan_b)); +#else + double a0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + double a1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + double b0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double b1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1)); + uint64_t d[2]; + d[0] = (a0 == a0 && b0 == b0) ? ~UINT64_C(0) : UINT64_C(0); + d[1] = (a1 == a1 && b1 == b1) ? ~UINT64_C(0) : UINT64_C(0); + + return vreinterpretq_m128d_u64(vld1q_u64(d)); +#endif +} + +// Compare the lower double-precision (64-bit) floating-point elements in a and +// b to see if neither is NaN, store the result in the lower element of dst, and +// copy the upper element from a to the upper element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_sd +FORCE_INLINE __m128d _mm_cmpord_sd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return _mm_move_sd(a, _mm_cmpord_pd(a, b)); +#else + double a0, b0; + a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + uint64_t a1 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1); + b0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + uint64_t d[2]; + d[0] = (a0 == a0 && b0 == b0) ? ~UINT64_C(0) : UINT64_C(0); + d[1] = a1; + + return vreinterpretq_m128d_u64(vld1q_u64(d)); +#endif +} + +// Compare packed double-precision (64-bit) floating-point elements in a and b +// to see if either is NaN, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_pd +FORCE_INLINE __m128d _mm_cmpunord_pd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + // Two NaNs are not equal in comparison operation. + uint64x2_t not_nan_a = + vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(a)); + uint64x2_t not_nan_b = + vceqq_f64(vreinterpretq_f64_m128d(b), vreinterpretq_f64_m128d(b)); + return vreinterpretq_m128d_s32( + vmvnq_s32(vreinterpretq_s32_u64(vandq_u64(not_nan_a, not_nan_b)))); +#else + double a0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + double a1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + double b0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double b1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1)); + uint64_t d[2]; + d[0] = (a0 == a0 && b0 == b0) ? UINT64_C(0) : ~UINT64_C(0); + d[1] = (a1 == a1 && b1 == b1) ? 
UINT64_C(0) : ~UINT64_C(0); + + return vreinterpretq_m128d_u64(vld1q_u64(d)); +#endif +} + +// Compare the lower double-precision (64-bit) floating-point elements in a and +// b to see if either is NaN, store the result in the lower element of dst, and +// copy the upper element from a to the upper element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_sd +FORCE_INLINE __m128d _mm_cmpunord_sd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return _mm_move_sd(a, _mm_cmpunord_pd(a, b)); +#else + double a0, b0; + a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + uint64_t a1 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1); + b0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + uint64_t d[2]; + d[0] = (a0 == a0 && b0 == b0) ? UINT64_C(0) : ~UINT64_C(0); + d[1] = a1; + + return vreinterpretq_m128d_u64(vld1q_u64(d)); +#endif +} + +// Compare the lower double-precision (64-bit) floating-point element in a and b +// for greater-than-or-equal, and return the boolean result (0 or 1). +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comige_sd +FORCE_INLINE int _mm_comige_sd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vgetq_lane_u64(vcgeq_f64(a, b), 0) & 0x1; +#else + double a0, b0; + a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + b0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + return a0 >= b0; +#endif +} + +// Compare the lower double-precision (64-bit) floating-point element in a and b +// for greater-than, and return the boolean result (0 or 1). +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comigt_sd +FORCE_INLINE int _mm_comigt_sd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vgetq_lane_u64(vcgtq_f64(a, b), 0) & 0x1; +#else + double a0, b0; + a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + b0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + + return a0 > b0; +#endif +} + +// Compare the lower double-precision (64-bit) floating-point element in a and b +// for less-than-or-equal, and return the boolean result (0 or 1). +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comile_sd +FORCE_INLINE int _mm_comile_sd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vgetq_lane_u64(vcleq_f64(a, b), 0) & 0x1; +#else + double a0, b0; + a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + b0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + + return a0 <= b0; +#endif +} + +// Compare the lower double-precision (64-bit) floating-point element in a and b +// for less-than, and return the boolean result (0 or 1). 
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comilt_sd +FORCE_INLINE int _mm_comilt_sd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vgetq_lane_u64(vcltq_f64(a, b), 0) & 0x1; +#else + double a0, b0; + a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + b0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + + return a0 < b0; +#endif +} + +// Compare the lower double-precision (64-bit) floating-point element in a and b +// for equality, and return the boolean result (0 or 1). +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comieq_sd +FORCE_INLINE int _mm_comieq_sd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vgetq_lane_u64(vceqq_f64(a, b), 0) & 0x1; +#else + uint32x4_t a_not_nan = + vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(a)); + uint32x4_t b_not_nan = + vceqq_u32(vreinterpretq_u32_m128d(b), vreinterpretq_u32_m128d(b)); + uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan); + uint32x4_t a_eq_b = + vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(b)); + uint64x2_t and_results = vandq_u64(vreinterpretq_u64_u32(a_and_b_not_nan), + vreinterpretq_u64_u32(a_eq_b)); + return vgetq_lane_u64(and_results, 0) & 0x1; +#endif +} + +// Compare the lower double-precision (64-bit) floating-point element in a and b +// for not-equal, and return the boolean result (0 or 1). +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comineq_sd +FORCE_INLINE int _mm_comineq_sd(__m128d a, __m128d b) +{ + return !_mm_comieq_sd(a, b); +} + +// Convert packed signed 32-bit integers in a to packed double-precision +// (64-bit) floating-point elements, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi32_pd +FORCE_INLINE __m128d _mm_cvtepi32_pd(__m128i a) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128d_f64( + vcvtq_f64_s64(vmovl_s32(vget_low_s32(vreinterpretq_s32_m128i(a))))); +#else + double a0 = (double) vgetq_lane_s32(vreinterpretq_s32_m128i(a), 0); + double a1 = (double) vgetq_lane_s32(vreinterpretq_s32_m128i(a), 1); + return _mm_set_pd(a1, a0); +#endif +} + +// Convert packed signed 32-bit integers in a to packed single-precision +// (32-bit) floating-point elements, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi32_ps +FORCE_INLINE __m128 _mm_cvtepi32_ps(__m128i a) +{ + return vreinterpretq_m128_f32(vcvtq_f32_s32(vreinterpretq_s32_m128i(a))); +} + +// Convert packed double-precision (64-bit) floating-point elements in a to +// packed 32-bit integers, and store the results in dst. 
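+
+// Illustrative sketch for this review (not part of the upstream sse2neon
+// header; the example name is invented): unlike the _mm_cmp*_sd family, the
+// _mm_comi*_sd helpers above collapse the comparison of the lower lanes into
+// a plain 0-or-1 scalar.
+FORCE_INLINE int sse2neon_example_comi_returns_scalar_bool(void)
+{
+    __m128d two = _mm_cvtepi32_pd(vreinterpretq_m128i_s32(vdupq_n_s32(2)));
+    __m128d three = _mm_cvtepi32_pd(vreinterpretq_m128i_s32(vdupq_n_s32(3)));
+    return _mm_comilt_sd(two, three) == 1 && _mm_comigt_sd(two, three) == 0;
+}
+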
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_epi32 +FORCE_INLINE __m128i _mm_cvtpd_epi32(__m128d a) +{ +// vrnd32xq_f64 not supported on clang +#if defined(__ARM_FEATURE_FRINT) && !defined(__clang__) + float64x2_t rounded = vrnd32xq_f64(vreinterpretq_f64_m128d(a)); + int64x2_t integers = vcvtq_s64_f64(rounded); + return vreinterpretq_m128i_s32( + vcombine_s32(vmovn_s64(integers), vdup_n_s32(0))); +#else + __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION); + double d0, d1; + d0 = sse2neon_recast_u64_f64( + vgetq_lane_u64(vreinterpretq_u64_m128d(rnd), 0)); + d1 = sse2neon_recast_u64_f64( + vgetq_lane_u64(vreinterpretq_u64_m128d(rnd), 1)); + return _mm_set_epi32(0, 0, (int32_t) d1, (int32_t) d0); +#endif +} + +// Convert packed double-precision (64-bit) floating-point elements in a to +// packed 32-bit integers, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_pi32 +FORCE_INLINE __m64 _mm_cvtpd_pi32(__m128d a) +{ + __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION); + double d0, d1; + d0 = sse2neon_recast_u64_f64( + vgetq_lane_u64(vreinterpretq_u64_m128d(rnd), 0)); + d1 = sse2neon_recast_u64_f64( + vgetq_lane_u64(vreinterpretq_u64_m128d(rnd), 1)); + int32_t ALIGN_STRUCT(16) data[2] = {(int32_t) d0, (int32_t) d1}; + return vreinterpret_m64_s32(vld1_s32(data)); +} + +// Convert packed double-precision (64-bit) floating-point elements in a to +// packed single-precision (32-bit) floating-point elements, and store the +// results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_ps +FORCE_INLINE __m128 _mm_cvtpd_ps(__m128d a) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + float32x2_t tmp = vcvt_f32_f64(vreinterpretq_f64_m128d(a)); + return vreinterpretq_m128_f32(vcombine_f32(tmp, vdup_n_f32(0))); +#else + double a0, a1; + a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + a1 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + return _mm_set_ps(0, 0, (float) a1, (float) a0); +#endif +} + +// Convert packed signed 32-bit integers in a to packed double-precision +// (64-bit) floating-point elements, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi32_pd +FORCE_INLINE __m128d _mm_cvtpi32_pd(__m64 a) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128d_f64( + vcvtq_f64_s64(vmovl_s32(vreinterpret_s32_m64(a)))); +#else + double a0 = (double) vget_lane_s32(vreinterpret_s32_m64(a), 0); + double a1 = (double) vget_lane_s32(vreinterpret_s32_m64(a), 1); + return _mm_set_pd(a1, a0); +#endif +} + +// Convert packed single-precision (32-bit) floating-point elements in a to +// packed 32-bit integers, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_epi32 +// *NOTE*. The default rounding mode on SSE is 'round to even', which ARMv7-A +// does not support! It is supported on ARMv8-A however. 
+FORCE_INLINE __m128i _mm_cvtps_epi32(__m128 a) +{ +#if defined(__ARM_FEATURE_FRINT) + return vreinterpretq_m128i_s32(vcvtq_s32_f32(vrnd32xq_f32(a))); +#elif (defined(__aarch64__) || defined(_M_ARM64)) || \ + defined(__ARM_FEATURE_DIRECTED_ROUNDING) + switch (_MM_GET_ROUNDING_MODE()) { + case _MM_ROUND_NEAREST: + return vreinterpretq_m128i_s32(vcvtnq_s32_f32(a)); + case _MM_ROUND_DOWN: + return vreinterpretq_m128i_s32(vcvtmq_s32_f32(a)); + case _MM_ROUND_UP: + return vreinterpretq_m128i_s32(vcvtpq_s32_f32(a)); + default: // _MM_ROUND_TOWARD_ZERO + return vreinterpretq_m128i_s32(vcvtq_s32_f32(a)); + } +#else + float *f = (float *) &a; + switch (_MM_GET_ROUNDING_MODE()) { + case _MM_ROUND_NEAREST: { + uint32x4_t signmask = vdupq_n_u32(0x80000000); + float32x4_t half = vbslq_f32(signmask, vreinterpretq_f32_m128(a), + vdupq_n_f32(0.5f)); /* +/- 0.5 */ + int32x4_t r_normal = vcvtq_s32_f32(vaddq_f32( + vreinterpretq_f32_m128(a), half)); /* round to integer: [a + 0.5]*/ + int32x4_t r_trunc = vcvtq_s32_f32( + vreinterpretq_f32_m128(a)); /* truncate to integer: [a] */ + int32x4_t plusone = vreinterpretq_s32_u32(vshrq_n_u32( + vreinterpretq_u32_s32(vnegq_s32(r_trunc)), 31)); /* 1 or 0 */ + int32x4_t r_even = vbicq_s32(vaddq_s32(r_trunc, plusone), + vdupq_n_s32(1)); /* ([a] + {0,1}) & ~1 */ + float32x4_t delta = vsubq_f32( + vreinterpretq_f32_m128(a), + vcvtq_f32_s32(r_trunc)); /* compute delta: delta = (a - [a]) */ + uint32x4_t is_delta_half = + vceqq_f32(delta, half); /* delta == +/- 0.5 */ + return vreinterpretq_m128i_s32( + vbslq_s32(is_delta_half, r_even, r_normal)); + } + case _MM_ROUND_DOWN: + return _mm_set_epi32(floorf(f[3]), floorf(f[2]), floorf(f[1]), + floorf(f[0])); + case _MM_ROUND_UP: + return _mm_set_epi32(ceilf(f[3]), ceilf(f[2]), ceilf(f[1]), + ceilf(f[0])); + default: // _MM_ROUND_TOWARD_ZERO + return _mm_set_epi32((int32_t) f[3], (int32_t) f[2], (int32_t) f[1], + (int32_t) f[0]); + } +#endif +} + +// Convert packed single-precision (32-bit) floating-point elements in a to +// packed double-precision (64-bit) floating-point elements, and store the +// results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pd +FORCE_INLINE __m128d _mm_cvtps_pd(__m128 a) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128d_f64( + vcvt_f64_f32(vget_low_f32(vreinterpretq_f32_m128(a)))); +#else + double a0 = (double) vgetq_lane_f32(vreinterpretq_f32_m128(a), 0); + double a1 = (double) vgetq_lane_f32(vreinterpretq_f32_m128(a), 1); + return _mm_set_pd(a1, a0); +#endif +} + +// Copy the lower double-precision (64-bit) floating-point element of a to dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_f64 +FORCE_INLINE double _mm_cvtsd_f64(__m128d a) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return (double) vgetq_lane_f64(vreinterpretq_f64_m128d(a), 0); +#else + double _a = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + return _a; +#endif +} + +// Convert the lower double-precision (64-bit) floating-point element in a to a +// 32-bit integer, and store the result in dst. 
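+
+// Illustrative sketch for this review (not part of the upstream sse2neon
+// header; the example name is invented): assuming the default
+// round-to-nearest-even mode is in effect, halfway cases are rounded to the
+// even integer by _mm_cvtps_epi32, so 2.5f becomes 2 while 3.5f becomes 4.
+FORCE_INLINE int sse2neon_example_cvtps_epi32_rounds_to_even(void)
+{
+    __m128 v = vreinterpretq_m128_f32(
+        vsetq_lane_f32(3.5f, vdupq_n_f32(2.5f), 1));
+    __m128i r = _mm_cvtps_epi32(v);
+    return vgetq_lane_s32(vreinterpretq_s32_m128i(r), 0) == 2 &&
+           vgetq_lane_s32(vreinterpretq_s32_m128i(r), 1) == 4;
+}
+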
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_si32 +FORCE_INLINE int32_t _mm_cvtsd_si32(__m128d a) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return (int32_t) vgetq_lane_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)), 0); +#else + __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION); + double ret = sse2neon_recast_u64_f64( + vgetq_lane_u64(vreinterpretq_u64_m128d(rnd), 0)); + return (int32_t) ret; +#endif +} + +// Convert the lower double-precision (64-bit) floating-point element in a to a +// 64-bit integer, and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_si64 +FORCE_INLINE int64_t _mm_cvtsd_si64(__m128d a) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return (int64_t) vgetq_lane_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)), 0); +#else + __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION); + double ret = sse2neon_recast_u64_f64( + vgetq_lane_u64(vreinterpretq_u64_m128d(rnd), 0)); + return (int64_t) ret; +#endif +} + +// Convert the lower double-precision (64-bit) floating-point element in a to a +// 64-bit integer, and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_si64x +#define _mm_cvtsd_si64x _mm_cvtsd_si64 + +// Convert the lower double-precision (64-bit) floating-point element in b to a +// single-precision (32-bit) floating-point element, store the result in the +// lower element of dst, and copy the upper 3 packed elements from a to the +// upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_ss +FORCE_INLINE __m128 _mm_cvtsd_ss(__m128 a, __m128d b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128_f32(vsetq_lane_f32( + vget_lane_f32(vcvt_f32_f64(vreinterpretq_f64_m128d(b)), 0), + vreinterpretq_f32_m128(a), 0)); +#else + double b0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + return vreinterpretq_m128_f32( + vsetq_lane_f32((float) b0, vreinterpretq_f32_m128(a), 0)); +#endif +} + +// Copy the lower 32-bit integer in a to dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi128_si32 +FORCE_INLINE int _mm_cvtsi128_si32(__m128i a) +{ + return vgetq_lane_s32(vreinterpretq_s32_m128i(a), 0); +} + +// Copy the lower 64-bit integer in a to dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi128_si64 +FORCE_INLINE int64_t _mm_cvtsi128_si64(__m128i a) +{ + return vgetq_lane_s64(vreinterpretq_s64_m128i(a), 0); +} + +// Copy the lower 64-bit integer in a to dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi128_si64x +#define _mm_cvtsi128_si64x(a) _mm_cvtsi128_si64(a) + +// Convert the signed 32-bit integer b to a double-precision (64-bit) +// floating-point element, store the result in the lower element of dst, and +// copy the upper element from a to the upper element of dst. 
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi32_sd +FORCE_INLINE __m128d _mm_cvtsi32_sd(__m128d a, int32_t b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128d_f64( + vsetq_lane_f64((double) b, vreinterpretq_f64_m128d(a), 0)); +#else + int64_t _b = sse2neon_recast_f64_s64((double) b); + return vreinterpretq_m128d_s64( + vsetq_lane_s64(_b, vreinterpretq_s64_m128d(a), 0)); +#endif +} + +// Copy the lower 64-bit integer in a to dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi128_si64x +#define _mm_cvtsi128_si64x(a) _mm_cvtsi128_si64(a) + +// Copy 32-bit integer a to the lower elements of dst, and zero the upper +// elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi32_si128 +FORCE_INLINE __m128i _mm_cvtsi32_si128(int a) +{ + return vreinterpretq_m128i_s32(vsetq_lane_s32(a, vdupq_n_s32(0), 0)); +} + +// Convert the signed 64-bit integer b to a double-precision (64-bit) +// floating-point element, store the result in the lower element of dst, and +// copy the upper element from a to the upper element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64_sd +FORCE_INLINE __m128d _mm_cvtsi64_sd(__m128d a, int64_t b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128d_f64( + vsetq_lane_f64((double) b, vreinterpretq_f64_m128d(a), 0)); +#else + int64_t _b = sse2neon_recast_f64_s64((double) b); + return vreinterpretq_m128d_s64( + vsetq_lane_s64(_b, vreinterpretq_s64_m128d(a), 0)); +#endif +} + +// Copy 64-bit integer a to the lower element of dst, and zero the upper +// element. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64_si128 +FORCE_INLINE __m128i _mm_cvtsi64_si128(int64_t a) +{ + return vreinterpretq_m128i_s64(vsetq_lane_s64(a, vdupq_n_s64(0), 0)); +} + +// Copy 64-bit integer a to the lower element of dst, and zero the upper +// element. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64x_si128 +#define _mm_cvtsi64x_si128(a) _mm_cvtsi64_si128(a) + +// Convert the signed 64-bit integer b to a double-precision (64-bit) +// floating-point element, store the result in the lower element of dst, and +// copy the upper element from a to the upper element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64x_sd +#define _mm_cvtsi64x_sd(a, b) _mm_cvtsi64_sd(a, b) + +// Convert the lower single-precision (32-bit) floating-point element in b to a +// double-precision (64-bit) floating-point element, store the result in the +// lower element of dst, and copy the upper element from a to the upper element +// of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_sd +FORCE_INLINE __m128d _mm_cvtss_sd(__m128d a, __m128 b) +{ + double d = (double) vgetq_lane_f32(vreinterpretq_f32_m128(b), 0); +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128d_f64( + vsetq_lane_f64(d, vreinterpretq_f64_m128d(a), 0)); +#else + return vreinterpretq_m128d_s64(vsetq_lane_s64( + sse2neon_recast_f64_s64(d), vreinterpretq_s64_m128d(a), 0)); +#endif +} + +// Convert packed double-precision (64-bit) floating-point elements in a to +// packed 32-bit integers with truncation, and store the results in dst. 
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttpd_epi32 +FORCE_INLINE __m128i _mm_cvttpd_epi32(__m128d a) +{ + double a0, a1; + a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + a1 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + return _mm_set_epi32(0, 0, (int32_t) a1, (int32_t) a0); +} + +// Convert packed double-precision (64-bit) floating-point elements in a to +// packed 32-bit integers with truncation, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttpd_pi32 +FORCE_INLINE __m64 _mm_cvttpd_pi32(__m128d a) +{ + double a0, a1; + a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + a1 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + int32_t ALIGN_STRUCT(16) data[2] = {(int32_t) a0, (int32_t) a1}; + return vreinterpret_m64_s32(vld1_s32(data)); +} + +// Convert packed single-precision (32-bit) floating-point elements in a to +// packed 32-bit integers with truncation, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttps_epi32 +FORCE_INLINE __m128i _mm_cvttps_epi32(__m128 a) +{ + return vreinterpretq_m128i_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a))); +} + +// Convert the lower double-precision (64-bit) floating-point element in a to a +// 32-bit integer with truncation, and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttsd_si32 +FORCE_INLINE int32_t _mm_cvttsd_si32(__m128d a) +{ + double _a = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + return (int32_t) _a; +} + +// Convert the lower double-precision (64-bit) floating-point element in a to a +// 64-bit integer with truncation, and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttsd_si64 +FORCE_INLINE int64_t _mm_cvttsd_si64(__m128d a) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vgetq_lane_s64(vcvtq_s64_f64(vreinterpretq_f64_m128d(a)), 0); +#else + double _a = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + return (int64_t) _a; +#endif +} + +// Convert the lower double-precision (64-bit) floating-point element in a to a +// 64-bit integer with truncation, and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttsd_si64x +#define _mm_cvttsd_si64x(a) _mm_cvttsd_si64(a) + +// Divide packed double-precision (64-bit) floating-point elements in a by +// packed elements in b, and store the results in dst. 
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_pd +FORCE_INLINE __m128d _mm_div_pd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128d_f64( + vdivq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); +#else + double a0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + double a1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + double b0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double b1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1)); + double c[2]; + c[0] = a0 / b0; + c[1] = a1 / b1; + return vld1q_f32((float32_t *) c); +#endif +} + +// Divide the lower double-precision (64-bit) floating-point element in a by the +// lower double-precision (64-bit) floating-point element in b, store the result +// in the lower element of dst, and copy the upper element from a to the upper +// element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_sd +FORCE_INLINE __m128d _mm_div_sd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + float64x2_t tmp = + vdivq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)); + return vreinterpretq_m128d_f64( + vsetq_lane_f64(vgetq_lane_f64(vreinterpretq_f64_m128d(a), 1), tmp, 1)); +#else + return _mm_move_sd(a, _mm_div_pd(a, b)); +#endif +} + +// Extract a 16-bit integer from a, selected with imm8, and store the result in +// the lower element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi16 +// FORCE_INLINE int _mm_extract_epi16(__m128i a, __constrange(0,8) int imm) +#define _mm_extract_epi16(a, imm) \ + vgetq_lane_u16(vreinterpretq_u16_m128i(a), (imm)) + +// Copy a to dst, and insert the 16-bit integer i into dst at the location +// specified by imm8. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi16 +// FORCE_INLINE __m128i _mm_insert_epi16(__m128i a, int b, +// __constrange(0,8) int imm) +#define _mm_insert_epi16(a, b, imm) \ + vreinterpretq_m128i_s16( \ + vsetq_lane_s16((b), vreinterpretq_s16_m128i(a), (imm))) + +// Load 128-bits (composed of 2 packed double-precision (64-bit) floating-point +// elements) from memory into dst. mem_addr must be aligned on a 16-byte +// boundary or a general-protection exception may be generated. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_pd +FORCE_INLINE __m128d _mm_load_pd(const double *p) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128d_f64(vld1q_f64(p)); +#else + const float *fp = (const float *) p; + float ALIGN_STRUCT(16) data[4] = {fp[0], fp[1], fp[2], fp[3]}; + return vreinterpretq_m128d_f32(vld1q_f32(data)); +#endif +} + +// Load a double-precision (64-bit) floating-point element from memory into both +// elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_pd1 +#define _mm_load_pd1 _mm_load1_pd + +// Load a double-precision (64-bit) floating-point element from memory into the +// lower of dst, and zero the upper element. mem_addr does not need to be +// aligned on any particular boundary. 
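+
+// Illustrative sketch for this review (not part of the upstream sse2neon
+// header; the example name is invented): _mm_load_pd expects a 16-byte
+// aligned pointer per its contract (even though the NEON loads used here do
+// not fault on misalignment), so the operands below are aligned explicitly.
+FORCE_INLINE int sse2neon_example_load_and_div_pd(void)
+{
+    double ALIGN_STRUCT(16) num[2] = {1.0, 9.0};
+    double ALIGN_STRUCT(16) den[2] = {2.0, 3.0};
+    __m128d q = _mm_div_pd(_mm_load_pd(num), _mm_load_pd(den));
+    return _mm_cvtsd_f64(q) == 0.5; /* lower lane holds 1.0 / 2.0 */
+}
+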
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_sd +FORCE_INLINE __m128d _mm_load_sd(const double *p) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128d_f64(vsetq_lane_f64(*p, vdupq_n_f64(0), 0)); +#else + const float *fp = (const float *) p; + float ALIGN_STRUCT(16) data[4] = {fp[0], fp[1], 0, 0}; + return vreinterpretq_m128d_f32(vld1q_f32(data)); +#endif +} + +// Load 128-bits of integer data from memory into dst. mem_addr must be aligned +// on a 16-byte boundary or a general-protection exception may be generated. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_si128 +FORCE_INLINE __m128i _mm_load_si128(const __m128i *p) +{ + return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p)); +} + +// Load a double-precision (64-bit) floating-point element from memory into both +// elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load1_pd +FORCE_INLINE __m128d _mm_load1_pd(const double *p) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128d_f64(vld1q_dup_f64(p)); +#else + return vreinterpretq_m128d_s64(vdupq_n_s64(*(const int64_t *) p)); +#endif +} + +// Load a double-precision (64-bit) floating-point element from memory into the +// upper element of dst, and copy the lower element from a to dst. mem_addr does +// not need to be aligned on any particular boundary. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadh_pd +FORCE_INLINE __m128d _mm_loadh_pd(__m128d a, const double *p) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128d_f64( + vcombine_f64(vget_low_f64(vreinterpretq_f64_m128d(a)), vld1_f64(p))); +#else + return vreinterpretq_m128d_f32(vcombine_f32( + vget_low_f32(vreinterpretq_f32_m128d(a)), vld1_f32((const float *) p))); +#endif +} + +// Load 64-bit integer from memory into the first element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadl_epi64 +FORCE_INLINE __m128i _mm_loadl_epi64(__m128i const *p) +{ + /* Load the lower 64 bits of the value pointed to by p into the + * lower 64 bits of the result, zeroing the upper 64 bits of the result. + */ + return vreinterpretq_m128i_s32( + vcombine_s32(vld1_s32((int32_t const *) p), vcreate_s32(0))); +} + +// Load a double-precision (64-bit) floating-point element from memory into the +// lower element of dst, and copy the upper element from a to dst. mem_addr does +// not need to be aligned on any particular boundary. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadl_pd +FORCE_INLINE __m128d _mm_loadl_pd(__m128d a, const double *p) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128d_f64( + vcombine_f64(vld1_f64(p), vget_high_f64(vreinterpretq_f64_m128d(a)))); +#else + return vreinterpretq_m128d_f32( + vcombine_f32(vld1_f32((const float *) p), + vget_high_f32(vreinterpretq_f32_m128d(a)))); +#endif +} + +// Load 2 double-precision (64-bit) floating-point elements from memory into dst +// in reverse order. mem_addr must be aligned on a 16-byte boundary or a +// general-protection exception may be generated. 
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadr_pd
+FORCE_INLINE __m128d _mm_loadr_pd(const double *p)
+{
+#if defined(__aarch64__) || defined(_M_ARM64)
+    float64x2_t v = vld1q_f64(p);
+    return vreinterpretq_m128d_f64(vextq_f64(v, v, 1));
+#else
+    int64x2_t v = vld1q_s64((const int64_t *) p);
+    return vreinterpretq_m128d_s64(vextq_s64(v, v, 1));
+#endif
+}
+
+// Load 2 double-precision (64-bit) floating-point elements from memory into
+// dst. mem_addr does not need to be aligned on any particular boundary.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_pd
+FORCE_INLINE __m128d _mm_loadu_pd(const double *p)
+{
+    return _mm_load_pd(p);
+}
+
+// Load 128-bits of integer data from memory into dst. mem_addr does not need to
+// be aligned on any particular boundary.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si128
+FORCE_INLINE __m128i _mm_loadu_si128(const __m128i *p)
+{
+    return vreinterpretq_m128i_s32(vld1q_s32((const unaligned_int32_t *) p));
+}
+
+// Load unaligned 32-bit integer from memory into the first element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si32
+FORCE_INLINE __m128i _mm_loadu_si32(const void *p)
+{
+    return vreinterpretq_m128i_s32(
+        vsetq_lane_s32(*(const unaligned_int32_t *) p, vdupq_n_s32(0), 0));
+}
+
+// Multiply packed signed 16-bit integers in a and b, producing intermediate
+// signed 32-bit integers. Horizontally add adjacent pairs of intermediate
+// 32-bit integers, and pack the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_madd_epi16
+FORCE_INLINE __m128i _mm_madd_epi16(__m128i a, __m128i b)
+{
+    int32x4_t low = vmull_s16(vget_low_s16(vreinterpretq_s16_m128i(a)),
+                              vget_low_s16(vreinterpretq_s16_m128i(b)));
+#if defined(__aarch64__) || defined(_M_ARM64)
+    int32x4_t high =
+        vmull_high_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b));
+
+    return vreinterpretq_m128i_s32(vpaddq_s32(low, high));
+#else
+    int32x4_t high = vmull_s16(vget_high_s16(vreinterpretq_s16_m128i(a)),
+                               vget_high_s16(vreinterpretq_s16_m128i(b)));
+
+    int32x2_t low_sum = vpadd_s32(vget_low_s32(low), vget_high_s32(low));
+    int32x2_t high_sum = vpadd_s32(vget_low_s32(high), vget_high_s32(high));
+
+    return vreinterpretq_m128i_s32(vcombine_s32(low_sum, high_sum));
+#endif
+}
+
+// Conditionally store 8-bit integer elements from a into memory using mask
+// (elements are not stored when the highest bit is not set in the corresponding
+// element) and a non-temporal memory hint. mem_addr does not need to be aligned
+// on any particular boundary.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskmoveu_si128
+FORCE_INLINE void _mm_maskmoveu_si128(__m128i a, __m128i mask, char *mem_addr)
+{
+    int8x16_t shr_mask = vshrq_n_s8(vreinterpretq_s8_m128i(mask), 7);
+    __m128 b = _mm_load_ps((const float *) mem_addr);
+    int8x16_t masked =
+        vbslq_s8(vreinterpretq_u8_s8(shr_mask), vreinterpretq_s8_m128i(a),
+                 vreinterpretq_s8_m128(b));
+    vst1q_s8((int8_t *) mem_addr, masked);
+}
+
+// Compare packed signed 16-bit integers in a and b, and store packed maximum
+// values in dst.
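+
+// Illustrative sketch for this review (not part of the upstream sse2neon
+// header; the example name is invented): _mm_madd_epi16 multiplies each
+// 16-bit pair and sums the two products into one 32-bit lane, so with every
+// input lane set to 3 and 4 each output lane holds 3*4 + 3*4 = 24.
+FORCE_INLINE int sse2neon_example_madd_epi16_pairwise(void)
+{
+    __m128i a = vreinterpretq_m128i_s16(vdupq_n_s16(3));
+    __m128i b = vreinterpretq_m128i_s16(vdupq_n_s16(4));
+    __m128i r = _mm_madd_epi16(a, b);
+    return vgetq_lane_s32(vreinterpretq_s32_m128i(r), 0) == 24;
+}
+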
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epi16 +FORCE_INLINE __m128i _mm_max_epi16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s16( + vmaxq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); +} + +// Compare packed unsigned 8-bit integers in a and b, and store packed maximum +// values in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu8 +FORCE_INLINE __m128i _mm_max_epu8(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u8( + vmaxq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b))); +} + +// Compare packed double-precision (64-bit) floating-point elements in a and b, +// and store packed maximum values in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_pd +FORCE_INLINE __m128d _mm_max_pd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) +#if SSE2NEON_PRECISE_MINMAX + float64x2_t _a = vreinterpretq_f64_m128d(a); + float64x2_t _b = vreinterpretq_f64_m128d(b); + return vreinterpretq_m128d_f64(vbslq_f64(vcgtq_f64(_a, _b), _a, _b)); +#else + return vreinterpretq_m128d_f64( + vmaxq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); +#endif +#else + double a0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + double a1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + double b0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double b1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1)); + int64_t d[2]; + d[0] = a0 > b0 ? sse2neon_recast_f64_s64(a0) : sse2neon_recast_f64_s64(b0); + d[1] = a1 > b1 ? sse2neon_recast_f64_s64(a1) : sse2neon_recast_f64_s64(b1); + + return vreinterpretq_m128d_s64(vld1q_s64(d)); +#endif +} + +// Compare the lower double-precision (64-bit) floating-point elements in a and +// b, store the maximum value in the lower element of dst, and copy the upper +// element from a to the upper element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_sd +FORCE_INLINE __m128d _mm_max_sd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return _mm_move_sd(a, _mm_max_pd(a, b)); +#else + double a0, a1, b0; + a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + a1 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + b0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double c[2] = {a0 > b0 ? a0 : b0, a1}; + return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) c)); +#endif +} + +// Compare packed signed 16-bit integers in a and b, and store packed minimum +// values in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epi16 +FORCE_INLINE __m128i _mm_min_epi16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s16( + vminq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); +} + +// Compare packed unsigned 8-bit integers in a and b, and store packed minimum +// values in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epu8 +FORCE_INLINE __m128i _mm_min_epu8(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u8( + vminq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b))); +} + +// Compare packed double-precision (64-bit) floating-point elements in a and b, +// and store packed minimum values in dst. 
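+
+// Illustrative sketch for this review (not part of the upstream sse2neon
+// header; the example name is invented): _mm_max_epu8 compares lanes as
+// unsigned values, so 0xFF (which would read as -1 if treated as signed)
+// still wins over 1.
+FORCE_INLINE int sse2neon_example_max_epu8_is_unsigned(void)
+{
+    __m128i a = vreinterpretq_m128i_u8(vdupq_n_u8(0xFF));
+    __m128i b = vreinterpretq_m128i_u8(vdupq_n_u8(1));
+    __m128i m = _mm_max_epu8(a, b);
+    return vgetq_lane_u8(vreinterpretq_u8_m128i(m), 0) == 0xFF;
+}
+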
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_pd +FORCE_INLINE __m128d _mm_min_pd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) +#if SSE2NEON_PRECISE_MINMAX + float64x2_t _a = vreinterpretq_f64_m128d(a); + float64x2_t _b = vreinterpretq_f64_m128d(b); + return vreinterpretq_m128d_f64(vbslq_f64(vcltq_f64(_a, _b), _a, _b)); +#else + return vreinterpretq_m128d_f64( + vminq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); +#endif +#else + double a0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + double a1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + double b0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double b1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1)); + int64_t d[2]; + d[0] = a0 < b0 ? sse2neon_recast_f64_s64(a0) : sse2neon_recast_f64_s64(b0); + d[1] = a1 < b1 ? sse2neon_recast_f64_s64(a1) : sse2neon_recast_f64_s64(b1); + return vreinterpretq_m128d_s64(vld1q_s64(d)); +#endif +} + +// Compare the lower double-precision (64-bit) floating-point elements in a and +// b, store the minimum value in the lower element of dst, and copy the upper +// element from a to the upper element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_sd +FORCE_INLINE __m128d _mm_min_sd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return _mm_move_sd(a, _mm_min_pd(a, b)); +#else + double a0, a1, b0; + a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + a1 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + b0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double c[2] = {a0 < b0 ? a0 : b0, a1}; + return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) c)); +#endif +} + +// Copy the lower 64-bit integer in a to the lower element of dst, and zero the +// upper element. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_move_epi64 +FORCE_INLINE __m128i _mm_move_epi64(__m128i a) +{ + return vreinterpretq_m128i_s64( + vsetq_lane_s64(0, vreinterpretq_s64_m128i(a), 1)); +} + +// Move the lower double-precision (64-bit) floating-point element from b to the +// lower element of dst, and copy the upper element from a to the upper element +// of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_move_sd +FORCE_INLINE __m128d _mm_move_sd(__m128d a, __m128d b) +{ + return vreinterpretq_m128d_f32( + vcombine_f32(vget_low_f32(vreinterpretq_f32_m128d(b)), + vget_high_f32(vreinterpretq_f32_m128d(a)))); +} + +// Create mask from the most significant bit of each 8-bit element in a, and +// store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_epi8 +FORCE_INLINE int _mm_movemask_epi8(__m128i a) +{ + // Use increasingly wide shifts+adds to collect the sign bits + // together. + // Since the widening shifts would be rather confusing to follow in little + // endian, everything will be illustrated in big endian order instead. This + // has a different result - the bits would actually be reversed on a big + // endian machine. + + // Starting input (only half the elements are shown): + // 89 ff 1d c0 00 10 99 33 + uint8x16_t input = vreinterpretq_u8_m128i(a); + + // Shift out everything but the sign bits with an unsigned shift right. 
+ // + // Bytes of the vector:: + // 89 ff 1d c0 00 10 99 33 + // \ \ \ \ \ \ \ \ high_bits = (uint16x4_t)(input >> 7) + // | | | | | | | | + // 01 01 00 01 00 00 01 00 + // + // Bits of first important lane(s): + // 10001001 (89) + // \______ + // | + // 00000001 (01) + uint16x8_t high_bits = vreinterpretq_u16_u8(vshrq_n_u8(input, 7)); + + // Merge the even lanes together with a 16-bit unsigned shift right + add. + // 'xx' represents garbage data which will be ignored in the final result. + // In the important bytes, the add functions like a binary OR. + // + // 01 01 00 01 00 00 01 00 + // \_ | \_ | \_ | \_ | paired16 = (uint32x4_t)(input + (input >> 7)) + // \| \| \| \| + // xx 03 xx 01 xx 00 xx 02 + // + // 00000001 00000001 (01 01) + // \_______ | + // \| + // xxxxxxxx xxxxxx11 (xx 03) + uint32x4_t paired16 = + vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 7)); + + // Repeat with a wider 32-bit shift + add. + // xx 03 xx 01 xx 00 xx 02 + // \____ | \____ | paired32 = (uint64x1_t)(paired16 + (paired16 >> + // 14)) + // \| \| + // xx xx xx 0d xx xx xx 02 + // + // 00000011 00000001 (03 01) + // \\_____ || + // '----.\|| + // xxxxxxxx xxxx1101 (xx 0d) + uint64x2_t paired32 = + vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 14)); + + // Last, an even wider 64-bit shift + add to get our result in the low 8 bit + // lanes. xx xx xx 0d xx xx xx 02 + // \_________ | paired64 = (uint8x8_t)(paired32 + (paired32 >> + // 28)) + // \| + // xx xx xx xx xx xx xx d2 + // + // 00001101 00000010 (0d 02) + // \ \___ | | + // '---. \| | + // xxxxxxxx 11010010 (xx d2) + uint8x16_t paired64 = + vreinterpretq_u8_u64(vsraq_n_u64(paired32, paired32, 28)); + + // Extract the low 8 bits from each 64-bit lane with 2 8-bit extracts. + // xx xx xx xx xx xx xx d2 + // || return paired64[0] + // d2 + // Note: Little endian would return the correct value 4b (01001011) instead. + return vgetq_lane_u8(paired64, 0) | ((int) vgetq_lane_u8(paired64, 8) << 8); +} + +// Set each bit of mask dst based on the most significant bit of the +// corresponding packed double-precision (64-bit) floating-point element in a. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_pd +FORCE_INLINE int _mm_movemask_pd(__m128d a) +{ + uint64x2_t input = vreinterpretq_u64_m128d(a); + uint64x2_t high_bits = vshrq_n_u64(input, 63); + return (int) (vgetq_lane_u64(high_bits, 0) | + (vgetq_lane_u64(high_bits, 1) << 1)); +} + +// Copy the lower 64-bit integer in a to dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movepi64_pi64 +FORCE_INLINE __m64 _mm_movepi64_pi64(__m128i a) +{ + return vreinterpret_m64_s64(vget_low_s64(vreinterpretq_s64_m128i(a))); +} + +// Copy the 64-bit integer a to the lower element of dst, and zero the upper +// element. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movpi64_epi64 +FORCE_INLINE __m128i _mm_movpi64_epi64(__m64 a) +{ + return vreinterpretq_m128i_s64( + vcombine_s64(vreinterpret_s64_m64(a), vdup_n_s64(0))); +} + +// Multiply the low unsigned 32-bit integers from each packed 64-bit element in +// a and b, and store the unsigned 64-bit results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_epu32 +FORCE_INLINE __m128i _mm_mul_epu32(__m128i a, __m128i b) +{ + // vmull_u32 upcasts instead of masking, so we downcast. 
+ uint32x2_t a_lo = vmovn_u64(vreinterpretq_u64_m128i(a)); + uint32x2_t b_lo = vmovn_u64(vreinterpretq_u64_m128i(b)); + return vreinterpretq_m128i_u64(vmull_u32(a_lo, b_lo)); +} + +// Multiply packed double-precision (64-bit) floating-point elements in a and b, +// and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_pd +FORCE_INLINE __m128d _mm_mul_pd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128d_f64( + vmulq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); +#else + double a0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + double a1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + double b0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double b1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1)); + double c[2]; + c[0] = a0 * b0; + c[1] = a1 * b1; + return vld1q_f32((float32_t *) c); +#endif +} + +// Multiply the lower double-precision (64-bit) floating-point element in a and +// b, store the result in the lower element of dst, and copy the upper element +// from a to the upper element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mul_sd +FORCE_INLINE __m128d _mm_mul_sd(__m128d a, __m128d b) +{ + return _mm_move_sd(a, _mm_mul_pd(a, b)); +} + +// Multiply the low unsigned 32-bit integers from a and b, and store the +// unsigned 64-bit result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_su32 +FORCE_INLINE __m64 _mm_mul_su32(__m64 a, __m64 b) +{ + return vreinterpret_m64_u64(vget_low_u64( + vmull_u32(vreinterpret_u32_m64(a), vreinterpret_u32_m64(b)))); +} + +// Multiply the packed signed 16-bit integers in a and b, producing intermediate +// 32-bit integers, and store the high 16 bits of the intermediate integers in +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhi_epi16 +FORCE_INLINE __m128i _mm_mulhi_epi16(__m128i a, __m128i b) +{ + /* FIXME: issue with large values because of result saturation */ + // int16x8_t ret = vqdmulhq_s16(vreinterpretq_s16_m128i(a), + // vreinterpretq_s16_m128i(b)); /* =2*a*b */ return + // vreinterpretq_m128i_s16(vshrq_n_s16(ret, 1)); + int16x4_t a3210 = vget_low_s16(vreinterpretq_s16_m128i(a)); + int16x4_t b3210 = vget_low_s16(vreinterpretq_s16_m128i(b)); + int32x4_t ab3210 = vmull_s16(a3210, b3210); /* 3333222211110000 */ + int16x4_t a7654 = vget_high_s16(vreinterpretq_s16_m128i(a)); + int16x4_t b7654 = vget_high_s16(vreinterpretq_s16_m128i(b)); + int32x4_t ab7654 = vmull_s16(a7654, b7654); /* 7777666655554444 */ + uint16x8x2_t r = + vuzpq_u16(vreinterpretq_u16_s32(ab3210), vreinterpretq_u16_s32(ab7654)); + return vreinterpretq_m128i_u16(r.val[1]); +} + +// Multiply the packed unsigned 16-bit integers in a and b, producing +// intermediate 32-bit integers, and store the high 16 bits of the intermediate +// integers in dst. 
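+// For example, a lane of 0xFFFF in both a and b gives the intermediate product
+// 0xFFFE0001, so the corresponding lane of dst receives 0xFFFE.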
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhi_epu16 +FORCE_INLINE __m128i _mm_mulhi_epu16(__m128i a, __m128i b) +{ + uint16x4_t a3210 = vget_low_u16(vreinterpretq_u16_m128i(a)); + uint16x4_t b3210 = vget_low_u16(vreinterpretq_u16_m128i(b)); + uint32x4_t ab3210 = vmull_u16(a3210, b3210); +#if defined(__aarch64__) || defined(_M_ARM64) + uint32x4_t ab7654 = + vmull_high_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)); + uint16x8_t r = vuzp2q_u16(vreinterpretq_u16_u32(ab3210), + vreinterpretq_u16_u32(ab7654)); + return vreinterpretq_m128i_u16(r); +#else + uint16x4_t a7654 = vget_high_u16(vreinterpretq_u16_m128i(a)); + uint16x4_t b7654 = vget_high_u16(vreinterpretq_u16_m128i(b)); + uint32x4_t ab7654 = vmull_u16(a7654, b7654); + uint16x8x2_t r = + vuzpq_u16(vreinterpretq_u16_u32(ab3210), vreinterpretq_u16_u32(ab7654)); + return vreinterpretq_m128i_u16(r.val[1]); +#endif +} + +// Multiply the packed 16-bit integers in a and b, producing intermediate 32-bit +// integers, and store the low 16 bits of the intermediate integers in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mullo_epi16 +FORCE_INLINE __m128i _mm_mullo_epi16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s16( + vmulq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); +} + +// Compute the bitwise OR of packed double-precision (64-bit) floating-point +// elements in a and b, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_or_pd +FORCE_INLINE __m128d _mm_or_pd(__m128d a, __m128d b) +{ + return vreinterpretq_m128d_s64( + vorrq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b))); +} + +// Compute the bitwise OR of 128 bits (representing integer data) in a and b, +// and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_or_si128 +FORCE_INLINE __m128i _mm_or_si128(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s32( + vorrq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +} + +// Convert packed signed 16-bit integers from a and b to packed 8-bit integers +// using signed saturation, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packs_epi16 +FORCE_INLINE __m128i _mm_packs_epi16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s8( + vcombine_s8(vqmovn_s16(vreinterpretq_s16_m128i(a)), + vqmovn_s16(vreinterpretq_s16_m128i(b)))); +} + +// Convert packed signed 32-bit integers from a and b to packed 16-bit integers +// using signed saturation, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packs_epi32 +FORCE_INLINE __m128i _mm_packs_epi32(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s16( + vcombine_s16(vqmovn_s32(vreinterpretq_s32_m128i(a)), + vqmovn_s32(vreinterpretq_s32_m128i(b)))); +} + +// Convert packed signed 16-bit integers from a and b to packed 8-bit integers +// using unsigned saturation, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packus_epi16 +FORCE_INLINE __m128i _mm_packus_epi16(const __m128i a, const __m128i b) +{ + return vreinterpretq_m128i_u8( + vcombine_u8(vqmovun_s16(vreinterpretq_s16_m128i(a)), + vqmovun_s16(vreinterpretq_s16_m128i(b)))); +} + +// Pause the processor. 
This is typically used in spin-wait loops and depending
+// on the x86 processor typical values are in the 40-100 cycle range. The
+// 'yield' instruction isn't a good fit because it's effectively a nop on most
+// Arm cores. Experience with several databases has shown an 'isb' is
+// a reasonable approximation.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_pause
+FORCE_INLINE void _mm_pause(void)
+{
+#if defined(_MSC_VER) && !defined(__clang__)
+    __isb(_ARM64_BARRIER_SY);
+#else
+    __asm__ __volatile__("isb\n");
+#endif
+}
+
+// Compute the absolute differences of packed unsigned 8-bit integers in a and
+// b, then horizontally sum each consecutive 8 differences to produce two
+// unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low
+// 16 bits of 64-bit elements in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sad_epu8
+FORCE_INLINE __m128i _mm_sad_epu8(__m128i a, __m128i b)
+{
+    uint16x8_t t = vpaddlq_u8(vabdq_u8((uint8x16_t) a, (uint8x16_t) b));
+    return vreinterpretq_m128i_u64(vpaddlq_u32(vpaddlq_u16(t)));
+}
+
+// Set packed 16-bit integers in dst with the supplied values.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi16
+FORCE_INLINE __m128i _mm_set_epi16(short i7,
+                                   short i6,
+                                   short i5,
+                                   short i4,
+                                   short i3,
+                                   short i2,
+                                   short i1,
+                                   short i0)
+{
+    int16_t ALIGN_STRUCT(16) data[8] = {i0, i1, i2, i3, i4, i5, i6, i7};
+    return vreinterpretq_m128i_s16(vld1q_s16(data));
+}
+
+// Set packed 32-bit integers in dst with the supplied values.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi32
+FORCE_INLINE __m128i _mm_set_epi32(int i3, int i2, int i1, int i0)
+{
+    int32_t ALIGN_STRUCT(16) data[4] = {i0, i1, i2, i3};
+    return vreinterpretq_m128i_s32(vld1q_s32(data));
+}
+
+// Set packed 64-bit integers in dst with the supplied values.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi64
+FORCE_INLINE __m128i _mm_set_epi64(__m64 i1, __m64 i2)
+{
+    return _mm_set_epi64x(vget_lane_s64(i1, 0), vget_lane_s64(i2, 0));
+}
+
+// Set packed 64-bit integers in dst with the supplied values.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi64x
+FORCE_INLINE __m128i _mm_set_epi64x(int64_t i1, int64_t i2)
+{
+    return vreinterpretq_m128i_s64(
+        vcombine_s64(vcreate_s64(i2), vcreate_s64(i1)));
+}
+
+// Set packed 8-bit integers in dst with the supplied values.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi8
+FORCE_INLINE __m128i _mm_set_epi8(signed char b15,
+                                  signed char b14,
+                                  signed char b13,
+                                  signed char b12,
+                                  signed char b11,
+                                  signed char b10,
+                                  signed char b9,
+                                  signed char b8,
+                                  signed char b7,
+                                  signed char b6,
+                                  signed char b5,
+                                  signed char b4,
+                                  signed char b3,
+                                  signed char b2,
+                                  signed char b1,
+                                  signed char b0)
+{
+    int8_t ALIGN_STRUCT(16)
+    data[16] = {(int8_t) b0,  (int8_t) b1,  (int8_t) b2,  (int8_t) b3,
+                (int8_t) b4,  (int8_t) b5,  (int8_t) b6,  (int8_t) b7,
+                (int8_t) b8,  (int8_t) b9,  (int8_t) b10, (int8_t) b11,
+                (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15};
+    return (__m128i) vld1q_s8(data);
+}
+
+// Set packed double-precision (64-bit) floating-point elements in dst with the
+// supplied values.
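+// Note the argument order: the first argument becomes the upper element and
+// the second the lower element, so _mm_set_pd(1.0, 2.0) yields 2.0 in lane 0
+// and 1.0 in lane 1.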
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_pd +FORCE_INLINE __m128d _mm_set_pd(double e1, double e0) +{ + double ALIGN_STRUCT(16) data[2] = {e0, e1}; +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128d_f64(vld1q_f64((float64_t *) data)); +#else + return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) data)); +#endif +} + +// Broadcast double-precision (64-bit) floating-point value a to all elements of +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_pd1 +#define _mm_set_pd1 _mm_set1_pd + +// Copy double-precision (64-bit) floating-point element a to the lower element +// of dst, and zero the upper element. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_sd +FORCE_INLINE __m128d _mm_set_sd(double a) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128d_f64(vsetq_lane_f64(a, vdupq_n_f64(0), 0)); +#else + return _mm_set_pd(0, a); +#endif +} + +// Broadcast 16-bit integer a to all elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi16 +FORCE_INLINE __m128i _mm_set1_epi16(short w) +{ + return vreinterpretq_m128i_s16(vdupq_n_s16(w)); +} + +// Broadcast 32-bit integer a to all elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi32 +FORCE_INLINE __m128i _mm_set1_epi32(int _i) +{ + return vreinterpretq_m128i_s32(vdupq_n_s32(_i)); +} + +// Broadcast 64-bit integer a to all elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi64 +FORCE_INLINE __m128i _mm_set1_epi64(__m64 _i) +{ + return vreinterpretq_m128i_s64(vdupq_lane_s64(_i, 0)); +} + +// Broadcast 64-bit integer a to all elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi64x +FORCE_INLINE __m128i _mm_set1_epi64x(int64_t _i) +{ + return vreinterpretq_m128i_s64(vdupq_n_s64(_i)); +} + +// Broadcast 8-bit integer a to all elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi8 +FORCE_INLINE __m128i _mm_set1_epi8(signed char w) +{ + return vreinterpretq_m128i_s8(vdupq_n_s8(w)); +} + +// Broadcast double-precision (64-bit) floating-point value a to all elements of +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_pd +FORCE_INLINE __m128d _mm_set1_pd(double d) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128d_f64(vdupq_n_f64(d)); +#else + int64_t _d = sse2neon_recast_f64_s64(d); + return vreinterpretq_m128d_s64(vdupq_n_s64(_d)); +#endif +} + +// Set packed 16-bit integers in dst with the supplied values in reverse order. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi16 +FORCE_INLINE __m128i _mm_setr_epi16(short w0, + short w1, + short w2, + short w3, + short w4, + short w5, + short w6, + short w7) +{ + int16_t ALIGN_STRUCT(16) data[8] = {w0, w1, w2, w3, w4, w5, w6, w7}; + return vreinterpretq_m128i_s16(vld1q_s16((int16_t *) data)); +} + +// Set packed 32-bit integers in dst with the supplied values in reverse order. 
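+// Here the first argument lands in the lowest lane, so
+// _mm_setr_epi32(0, 1, 2, 3) is equivalent to _mm_set_epi32(3, 2, 1, 0).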
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi32 +FORCE_INLINE __m128i _mm_setr_epi32(int i3, int i2, int i1, int i0) +{ + int32_t ALIGN_STRUCT(16) data[4] = {i3, i2, i1, i0}; + return vreinterpretq_m128i_s32(vld1q_s32(data)); +} + +// Set packed 64-bit integers in dst with the supplied values in reverse order. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi64 +FORCE_INLINE __m128i _mm_setr_epi64(__m64 e1, __m64 e0) +{ + return vreinterpretq_m128i_s64(vcombine_s64(e1, e0)); +} + +// Set packed 8-bit integers in dst with the supplied values in reverse order. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi8 +FORCE_INLINE __m128i _mm_setr_epi8(signed char b0, + signed char b1, + signed char b2, + signed char b3, + signed char b4, + signed char b5, + signed char b6, + signed char b7, + signed char b8, + signed char b9, + signed char b10, + signed char b11, + signed char b12, + signed char b13, + signed char b14, + signed char b15) +{ + int8_t ALIGN_STRUCT(16) + data[16] = {(int8_t) b0, (int8_t) b1, (int8_t) b2, (int8_t) b3, + (int8_t) b4, (int8_t) b5, (int8_t) b6, (int8_t) b7, + (int8_t) b8, (int8_t) b9, (int8_t) b10, (int8_t) b11, + (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15}; + return (__m128i) vld1q_s8(data); +} + +// Set packed double-precision (64-bit) floating-point elements in dst with the +// supplied values in reverse order. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_pd +FORCE_INLINE __m128d _mm_setr_pd(double e1, double e0) +{ + return _mm_set_pd(e0, e1); +} + +// Return vector of type __m128d with all elements set to zero. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_pd +FORCE_INLINE __m128d _mm_setzero_pd(void) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128d_f64(vdupq_n_f64(0)); +#else + return vreinterpretq_m128d_f32(vdupq_n_f32(0)); +#endif +} + +// Return vector of type __m128i with all elements set to zero. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_si128 +FORCE_INLINE __m128i _mm_setzero_si128(void) +{ + return vreinterpretq_m128i_s32(vdupq_n_s32(0)); +} + +// Shuffle 32-bit integers in a using the control in imm8, and store the results +// in dst. 
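+// For example, _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 1, 2, 3)) reverses the four
+// 32-bit lanes, and _MM_SHUFFLE(3, 2, 1, 0) is the identity permutation.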
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_epi32 +// FORCE_INLINE __m128i _mm_shuffle_epi32(__m128i a, +// __constrange(0,255) int imm) +#if defined(_sse2neon_shuffle) +#define _mm_shuffle_epi32(a, imm) \ + __extension__({ \ + int32x4_t _input = vreinterpretq_s32_m128i(a); \ + int32x4_t _shuf = \ + vshuffleq_s32(_input, _input, (imm) & (0x3), ((imm) >> 2) & 0x3, \ + ((imm) >> 4) & 0x3, ((imm) >> 6) & 0x3); \ + vreinterpretq_m128i_s32(_shuf); \ + }) +#else // generic +#define _mm_shuffle_epi32(a, imm) \ + _sse2neon_define1( \ + __m128i, a, __m128i ret; switch (imm) { \ + case _MM_SHUFFLE(1, 0, 3, 2): \ + ret = _mm_shuffle_epi_1032(_a); \ + break; \ + case _MM_SHUFFLE(2, 3, 0, 1): \ + ret = _mm_shuffle_epi_2301(_a); \ + break; \ + case _MM_SHUFFLE(0, 3, 2, 1): \ + ret = _mm_shuffle_epi_0321(_a); \ + break; \ + case _MM_SHUFFLE(2, 1, 0, 3): \ + ret = _mm_shuffle_epi_2103(_a); \ + break; \ + case _MM_SHUFFLE(1, 0, 1, 0): \ + ret = _mm_shuffle_epi_1010(_a); \ + break; \ + case _MM_SHUFFLE(1, 0, 0, 1): \ + ret = _mm_shuffle_epi_1001(_a); \ + break; \ + case _MM_SHUFFLE(0, 1, 0, 1): \ + ret = _mm_shuffle_epi_0101(_a); \ + break; \ + case _MM_SHUFFLE(2, 2, 1, 1): \ + ret = _mm_shuffle_epi_2211(_a); \ + break; \ + case _MM_SHUFFLE(0, 1, 2, 2): \ + ret = _mm_shuffle_epi_0122(_a); \ + break; \ + case _MM_SHUFFLE(3, 3, 3, 2): \ + ret = _mm_shuffle_epi_3332(_a); \ + break; \ + case _MM_SHUFFLE(0, 0, 0, 0): \ + ret = _mm_shuffle_epi32_splat(_a, 0); \ + break; \ + case _MM_SHUFFLE(1, 1, 1, 1): \ + ret = _mm_shuffle_epi32_splat(_a, 1); \ + break; \ + case _MM_SHUFFLE(2, 2, 2, 2): \ + ret = _mm_shuffle_epi32_splat(_a, 2); \ + break; \ + case _MM_SHUFFLE(3, 3, 3, 3): \ + ret = _mm_shuffle_epi32_splat(_a, 3); \ + break; \ + default: \ + ret = _mm_shuffle_epi32_default(_a, (imm)); \ + break; \ + } _sse2neon_return(ret);) +#endif + +// Shuffle double-precision (64-bit) floating-point elements using the control +// in imm8, and store the results in dst. 
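+// Bit 0 of imm8 selects which element of a goes to the lower lane of dst and
+// bit 1 selects which element of b goes to the upper lane, so imm8 == 0 yields
+// {a[0], b[0]} and imm8 == 3 yields {a[1], b[1]}.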
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_pd +#ifdef _sse2neon_shuffle +#define _mm_shuffle_pd(a, b, imm8) \ + vreinterpretq_m128d_s64( \ + vshuffleq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b), \ + imm8 & 0x1, ((imm8 & 0x2) >> 1) + 2)) +#else +#define _mm_shuffle_pd(a, b, imm8) \ + _mm_castsi128_pd(_mm_set_epi64x( \ + vgetq_lane_s64(vreinterpretq_s64_m128d(b), (imm8 & 0x2) >> 1), \ + vgetq_lane_s64(vreinterpretq_s64_m128d(a), imm8 & 0x1))) +#endif + +// FORCE_INLINE __m128i _mm_shufflehi_epi16(__m128i a, +// __constrange(0,255) int imm) +#if defined(_sse2neon_shuffle) +#define _mm_shufflehi_epi16(a, imm) \ + __extension__({ \ + int16x8_t _input = vreinterpretq_s16_m128i(a); \ + int16x8_t _shuf = \ + vshuffleq_s16(_input, _input, 0, 1, 2, 3, ((imm) & (0x3)) + 4, \ + (((imm) >> 2) & 0x3) + 4, (((imm) >> 4) & 0x3) + 4, \ + (((imm) >> 6) & 0x3) + 4); \ + vreinterpretq_m128i_s16(_shuf); \ + }) +#else // generic +#define _mm_shufflehi_epi16(a, imm) _mm_shufflehi_epi16_function((a), (imm)) +#endif + +// FORCE_INLINE __m128i _mm_shufflelo_epi16(__m128i a, +// __constrange(0,255) int imm) +#if defined(_sse2neon_shuffle) +#define _mm_shufflelo_epi16(a, imm) \ + __extension__({ \ + int16x8_t _input = vreinterpretq_s16_m128i(a); \ + int16x8_t _shuf = vshuffleq_s16( \ + _input, _input, ((imm) & (0x3)), (((imm) >> 2) & 0x3), \ + (((imm) >> 4) & 0x3), (((imm) >> 6) & 0x3), 4, 5, 6, 7); \ + vreinterpretq_m128i_s16(_shuf); \ + }) +#else // generic +#define _mm_shufflelo_epi16(a, imm) _mm_shufflelo_epi16_function((a), (imm)) +#endif + +// Shift packed 16-bit integers in a left by count while shifting in zeros, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi16 +FORCE_INLINE __m128i _mm_sll_epi16(__m128i a, __m128i count) +{ + uint64_t c = vreinterpretq_nth_u64_m128i(count, 0); + if (_sse2neon_unlikely(c & ~15)) + return _mm_setzero_si128(); + + int16x8_t vc = vdupq_n_s16((int16_t) c); + return vreinterpretq_m128i_s16(vshlq_s16(vreinterpretq_s16_m128i(a), vc)); +} + +// Shift packed 32-bit integers in a left by count while shifting in zeros, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi32 +FORCE_INLINE __m128i _mm_sll_epi32(__m128i a, __m128i count) +{ + uint64_t c = vreinterpretq_nth_u64_m128i(count, 0); + if (_sse2neon_unlikely(c & ~31)) + return _mm_setzero_si128(); + + int32x4_t vc = vdupq_n_s32((int32_t) c); + return vreinterpretq_m128i_s32(vshlq_s32(vreinterpretq_s32_m128i(a), vc)); +} + +// Shift packed 64-bit integers in a left by count while shifting in zeros, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi64 +FORCE_INLINE __m128i _mm_sll_epi64(__m128i a, __m128i count) +{ + uint64_t c = vreinterpretq_nth_u64_m128i(count, 0); + if (_sse2neon_unlikely(c & ~63)) + return _mm_setzero_si128(); + + int64x2_t vc = vdupq_n_s64((int64_t) c); + return vreinterpretq_m128i_s64(vshlq_s64(vreinterpretq_s64_m128i(a), vc)); +} + +// Shift packed 16-bit integers in a left by imm8 while shifting in zeros, and +// store the results in dst. 
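+// Shift counts of 16 or more clear every lane to zero, matching the x86
+// behaviour; e.g. _mm_slli_epi16(_mm_set1_epi16(1), 3) gives 8 in every lane.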
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi16 +FORCE_INLINE __m128i _mm_slli_epi16(__m128i a, int imm) +{ + if (_sse2neon_unlikely(imm & ~15)) + return _mm_setzero_si128(); + return vreinterpretq_m128i_s16( + vshlq_s16(vreinterpretq_s16_m128i(a), vdupq_n_s16(imm))); +} + +// Shift packed 32-bit integers in a left by imm8 while shifting in zeros, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi32 +FORCE_INLINE __m128i _mm_slli_epi32(__m128i a, int imm) +{ + if (_sse2neon_unlikely(imm & ~31)) + return _mm_setzero_si128(); + return vreinterpretq_m128i_s32( + vshlq_s32(vreinterpretq_s32_m128i(a), vdupq_n_s32(imm))); +} + +// Shift packed 64-bit integers in a left by imm8 while shifting in zeros, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi64 +FORCE_INLINE __m128i _mm_slli_epi64(__m128i a, int imm) +{ + if (_sse2neon_unlikely(imm & ~63)) + return _mm_setzero_si128(); + return vreinterpretq_m128i_s64( + vshlq_s64(vreinterpretq_s64_m128i(a), vdupq_n_s64(imm))); +} + +// Shift a left by imm8 bytes while shifting in zeros, and store the results in +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_si128 +#define _mm_slli_si128(a, imm) \ + _sse2neon_define1( \ + __m128i, a, int8x16_t ret; \ + if (_sse2neon_unlikely(imm == 0)) ret = vreinterpretq_s8_m128i(_a); \ + else if (_sse2neon_unlikely((imm) & ~15)) ret = vdupq_n_s8(0); \ + else ret = vextq_s8(vdupq_n_s8(0), vreinterpretq_s8_m128i(_a), \ + ((imm <= 0 || imm > 15) ? 0 : (16 - imm))); \ + _sse2neon_return(vreinterpretq_m128i_s8(ret));) + +// Compute the square root of packed double-precision (64-bit) floating-point +// elements in a, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_pd +FORCE_INLINE __m128d _mm_sqrt_pd(__m128d a) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128d_f64(vsqrtq_f64(vreinterpretq_f64_m128d(a))); +#else + double a0, a1; + a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + a1 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + double _a0 = sqrt(a0); + double _a1 = sqrt(a1); + return _mm_set_pd(_a1, _a0); +#endif +} + +// Compute the square root of the lower double-precision (64-bit) floating-point +// element in b, store the result in the lower element of dst, and copy the +// upper element from a to the upper element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_sd +FORCE_INLINE __m128d _mm_sqrt_sd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return _mm_move_sd(a, _mm_sqrt_pd(b)); +#else + double _a, _b; + _a = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + _b = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + return _mm_set_pd(_a, sqrt(_b)); +#endif +} + +// Shift packed 16-bit integers in a right by count while shifting in sign bits, +// and store the results in dst. 
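+// The shift count is taken from the low 64 bits of count; counts of 16 or more
+// leave every lane filled with its sign bit (all zeros or all ones).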
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sra_epi16 +FORCE_INLINE __m128i _mm_sra_epi16(__m128i a, __m128i count) +{ + int64_t c = vgetq_lane_s64(count, 0); + if (_sse2neon_unlikely(c & ~15)) + return _mm_cmplt_epi16(a, _mm_setzero_si128()); + return vreinterpretq_m128i_s16( + vshlq_s16((int16x8_t) a, vdupq_n_s16((int) -c))); +} + +// Shift packed 32-bit integers in a right by count while shifting in sign bits, +// and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sra_epi32 +FORCE_INLINE __m128i _mm_sra_epi32(__m128i a, __m128i count) +{ + int64_t c = vgetq_lane_s64(count, 0); + if (_sse2neon_unlikely(c & ~31)) + return _mm_cmplt_epi32(a, _mm_setzero_si128()); + return vreinterpretq_m128i_s32( + vshlq_s32((int32x4_t) a, vdupq_n_s32((int) -c))); +} + +// Shift packed 16-bit integers in a right by imm8 while shifting in sign +// bits, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srai_epi16 +FORCE_INLINE __m128i _mm_srai_epi16(__m128i a, int imm) +{ + const int count = (imm & ~15) ? 15 : imm; + return (__m128i) vshlq_s16((int16x8_t) a, vdupq_n_s16(-count)); +} + +// Shift packed 32-bit integers in a right by imm8 while shifting in sign bits, +// and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srai_epi32 +// FORCE_INLINE __m128i _mm_srai_epi32(__m128i a, __constrange(0,255) int imm) +#define _mm_srai_epi32(a, imm) \ + _sse2neon_define0( \ + __m128i, a, __m128i ret; if (_sse2neon_unlikely((imm) == 0)) { \ + ret = _a; \ + } else if (_sse2neon_likely(0 < (imm) && (imm) < 32)) { \ + ret = vreinterpretq_m128i_s32( \ + vshlq_s32(vreinterpretq_s32_m128i(_a), vdupq_n_s32(-(imm)))); \ + } else { \ + ret = vreinterpretq_m128i_s32( \ + vshrq_n_s32(vreinterpretq_s32_m128i(_a), 31)); \ + } _sse2neon_return(ret);) + +// Shift packed 16-bit integers in a right by count while shifting in zeros, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi16 +FORCE_INLINE __m128i _mm_srl_epi16(__m128i a, __m128i count) +{ + uint64_t c = vreinterpretq_nth_u64_m128i(count, 0); + if (_sse2neon_unlikely(c & ~15)) + return _mm_setzero_si128(); + + int16x8_t vc = vdupq_n_s16(-(int16_t) c); + return vreinterpretq_m128i_u16(vshlq_u16(vreinterpretq_u16_m128i(a), vc)); +} + +// Shift packed 32-bit integers in a right by count while shifting in zeros, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi32 +FORCE_INLINE __m128i _mm_srl_epi32(__m128i a, __m128i count) +{ + uint64_t c = vreinterpretq_nth_u64_m128i(count, 0); + if (_sse2neon_unlikely(c & ~31)) + return _mm_setzero_si128(); + + int32x4_t vc = vdupq_n_s32(-(int32_t) c); + return vreinterpretq_m128i_u32(vshlq_u32(vreinterpretq_u32_m128i(a), vc)); +} + +// Shift packed 64-bit integers in a right by count while shifting in zeros, and +// store the results in dst. 
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi64 +FORCE_INLINE __m128i _mm_srl_epi64(__m128i a, __m128i count) +{ + uint64_t c = vreinterpretq_nth_u64_m128i(count, 0); + if (_sse2neon_unlikely(c & ~63)) + return _mm_setzero_si128(); + + int64x2_t vc = vdupq_n_s64(-(int64_t) c); + return vreinterpretq_m128i_u64(vshlq_u64(vreinterpretq_u64_m128i(a), vc)); +} + +// Shift packed 16-bit integers in a right by imm8 while shifting in zeros, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi16 +#define _mm_srli_epi16(a, imm) \ + _sse2neon_define0( \ + __m128i, a, __m128i ret; if (_sse2neon_unlikely((imm) & ~15)) { \ + ret = _mm_setzero_si128(); \ + } else { \ + ret = vreinterpretq_m128i_u16( \ + vshlq_u16(vreinterpretq_u16_m128i(_a), vdupq_n_s16(-(imm)))); \ + } _sse2neon_return(ret);) + +// Shift packed 32-bit integers in a right by imm8 while shifting in zeros, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi32 +// FORCE_INLINE __m128i _mm_srli_epi32(__m128i a, __constrange(0,255) int imm) +#define _mm_srli_epi32(a, imm) \ + _sse2neon_define0( \ + __m128i, a, __m128i ret; if (_sse2neon_unlikely((imm) & ~31)) { \ + ret = _mm_setzero_si128(); \ + } else { \ + ret = vreinterpretq_m128i_u32( \ + vshlq_u32(vreinterpretq_u32_m128i(_a), vdupq_n_s32(-(imm)))); \ + } _sse2neon_return(ret);) + +// Shift packed 64-bit integers in a right by imm8 while shifting in zeros, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi64 +#define _mm_srli_epi64(a, imm) \ + _sse2neon_define0( \ + __m128i, a, __m128i ret; if (_sse2neon_unlikely((imm) & ~63)) { \ + ret = _mm_setzero_si128(); \ + } else { \ + ret = vreinterpretq_m128i_u64( \ + vshlq_u64(vreinterpretq_u64_m128i(_a), vdupq_n_s64(-(imm)))); \ + } _sse2neon_return(ret);) + +// Shift a right by imm8 bytes while shifting in zeros, and store the results in +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_si128 +#define _mm_srli_si128(a, imm) \ + _sse2neon_define1( \ + __m128i, a, int8x16_t ret; \ + if (_sse2neon_unlikely((imm) & ~15)) ret = vdupq_n_s8(0); \ + else ret = vextq_s8(vreinterpretq_s8_m128i(_a), vdupq_n_s8(0), \ + (imm > 15 ? 0 : imm)); \ + _sse2neon_return(vreinterpretq_m128i_s8(ret));) + +// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point +// elements) from a into memory. mem_addr must be aligned on a 16-byte boundary +// or a general-protection exception may be generated. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_pd +FORCE_INLINE void _mm_store_pd(double *mem_addr, __m128d a) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + vst1q_f64((float64_t *) mem_addr, vreinterpretq_f64_m128d(a)); +#else + vst1q_f32((float32_t *) mem_addr, vreinterpretq_f32_m128d(a)); +#endif +} + +// Store the lower double-precision (64-bit) floating-point element from a into +// 2 contiguous elements in memory. mem_addr must be aligned on a 16-byte +// boundary or a general-protection exception may be generated. 
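+// In effect both mem_addr[0] and mem_addr[1] receive the lower element of a.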
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_pd1 +FORCE_INLINE void _mm_store_pd1(double *mem_addr, __m128d a) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + float64x1_t a_low = vget_low_f64(vreinterpretq_f64_m128d(a)); + vst1q_f64((float64_t *) mem_addr, + vreinterpretq_f64_m128d(vcombine_f64(a_low, a_low))); +#else + float32x2_t a_low = vget_low_f32(vreinterpretq_f32_m128d(a)); + vst1q_f32((float32_t *) mem_addr, + vreinterpretq_f32_m128d(vcombine_f32(a_low, a_low))); +#endif +} + +// Store the lower double-precision (64-bit) floating-point element from a into +// memory. mem_addr does not need to be aligned on any particular boundary. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_store_sd +FORCE_INLINE void _mm_store_sd(double *mem_addr, __m128d a) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + vst1_f64((float64_t *) mem_addr, vget_low_f64(vreinterpretq_f64_m128d(a))); +#else + vst1_u64((uint64_t *) mem_addr, vget_low_u64(vreinterpretq_u64_m128d(a))); +#endif +} + +// Store 128-bits of integer data from a into memory. mem_addr must be aligned +// on a 16-byte boundary or a general-protection exception may be generated. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_si128 +FORCE_INLINE void _mm_store_si128(__m128i *p, __m128i a) +{ + vst1q_s32((int32_t *) p, vreinterpretq_s32_m128i(a)); +} + +// Store the lower double-precision (64-bit) floating-point element from a into +// 2 contiguous elements in memory. mem_addr must be aligned on a 16-byte +// boundary or a general-protection exception may be generated. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#expand=9,526,5601&text=_mm_store1_pd +#define _mm_store1_pd _mm_store_pd1 + +// Store the upper double-precision (64-bit) floating-point element from a into +// memory. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeh_pd +FORCE_INLINE void _mm_storeh_pd(double *mem_addr, __m128d a) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + vst1_f64((float64_t *) mem_addr, vget_high_f64(vreinterpretq_f64_m128d(a))); +#else + vst1_f32((float32_t *) mem_addr, vget_high_f32(vreinterpretq_f32_m128d(a))); +#endif +} + +// Store 64-bit integer from the first element of a into memory. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storel_epi64 +FORCE_INLINE void _mm_storel_epi64(__m128i *a, __m128i b) +{ + vst1_u64((uint64_t *) a, vget_low_u64(vreinterpretq_u64_m128i(b))); +} + +// Store the lower double-precision (64-bit) floating-point element from a into +// memory. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storel_pd +FORCE_INLINE void _mm_storel_pd(double *mem_addr, __m128d a) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + vst1_f64((float64_t *) mem_addr, vget_low_f64(vreinterpretq_f64_m128d(a))); +#else + vst1_f32((float32_t *) mem_addr, vget_low_f32(vreinterpretq_f32_m128d(a))); +#endif +} + +// Store 2 double-precision (64-bit) floating-point elements from a into memory +// in reverse order. mem_addr must be aligned on a 16-byte boundary or a +// general-protection exception may be generated. 
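+// That is, the upper element of a is written to mem_addr[0] and the lower
+// element to mem_addr[1].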
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storer_pd +FORCE_INLINE void _mm_storer_pd(double *mem_addr, __m128d a) +{ + float32x4_t f = vreinterpretq_f32_m128d(a); + _mm_store_pd(mem_addr, vreinterpretq_m128d_f32(vextq_f32(f, f, 2))); +} + +// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point +// elements) from a into memory. mem_addr does not need to be aligned on any +// particular boundary. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_pd +FORCE_INLINE void _mm_storeu_pd(double *mem_addr, __m128d a) +{ + _mm_store_pd(mem_addr, a); +} + +// Store 128-bits of integer data from a into memory. mem_addr does not need to +// be aligned on any particular boundary. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si128 +FORCE_INLINE void _mm_storeu_si128(__m128i *p, __m128i a) +{ + vst1q_s32((int32_t *) p, vreinterpretq_s32_m128i(a)); +} + +// Store 32-bit integer from the first element of a into memory. mem_addr does +// not need to be aligned on any particular boundary. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si32 +FORCE_INLINE void _mm_storeu_si32(void *p, __m128i a) +{ + vst1q_lane_s32((int32_t *) p, vreinterpretq_s32_m128i(a), 0); +} + +// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point +// elements) from a into memory using a non-temporal memory hint. mem_addr must +// be aligned on a 16-byte boundary or a general-protection exception may be +// generated. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_pd +FORCE_INLINE void _mm_stream_pd(double *p, __m128d a) +{ +#if __has_builtin(__builtin_nontemporal_store) + __builtin_nontemporal_store(a, (__m128d *) p); +#elif defined(__aarch64__) || defined(_M_ARM64) + vst1q_f64(p, vreinterpretq_f64_m128d(a)); +#else + vst1q_s64((int64_t *) p, vreinterpretq_s64_m128d(a)); +#endif +} + +// Store 128-bits of integer data from a into memory using a non-temporal memory +// hint. mem_addr must be aligned on a 16-byte boundary or a general-protection +// exception may be generated. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_si128 +FORCE_INLINE void _mm_stream_si128(__m128i *p, __m128i a) +{ +#if __has_builtin(__builtin_nontemporal_store) + __builtin_nontemporal_store(a, p); +#else + vst1q_s64((int64_t *) p, vreinterpretq_s64_m128i(a)); +#endif +} + +// Store 32-bit integer a into memory using a non-temporal hint to minimize +// cache pollution. If the cache line containing address mem_addr is already in +// the cache, the cache will be updated. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_si32 +FORCE_INLINE void _mm_stream_si32(int *p, int a) +{ + vst1q_lane_s32((int32_t *) p, vdupq_n_s32(a), 0); +} + +// Store 64-bit integer a into memory using a non-temporal hint to minimize +// cache pollution. If the cache line containing address mem_addr is already in +// the cache, the cache will be updated. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_si64 +FORCE_INLINE void _mm_stream_si64(__int64 *p, __int64 a) +{ + vst1_s64((int64_t *) p, vdup_n_s64((int64_t) a)); +} + +// Subtract packed 16-bit integers in b from packed 16-bit integers in a, and +// store the results in dst. 
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi16 +FORCE_INLINE __m128i _mm_sub_epi16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s16( + vsubq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); +} + +// Subtract packed 32-bit integers in b from packed 32-bit integers in a, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi32 +FORCE_INLINE __m128i _mm_sub_epi32(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s32( + vsubq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +} + +// Subtract packed 64-bit integers in b from packed 64-bit integers in a, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi64 +FORCE_INLINE __m128i _mm_sub_epi64(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s64( + vsubq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b))); +} + +// Subtract packed 8-bit integers in b from packed 8-bit integers in a, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi8 +FORCE_INLINE __m128i _mm_sub_epi8(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s8( + vsubq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); +} + +// Subtract packed double-precision (64-bit) floating-point elements in b from +// packed double-precision (64-bit) floating-point elements in a, and store the +// results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_sub_pd +FORCE_INLINE __m128d _mm_sub_pd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128d_f64( + vsubq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); +#else + double a0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + double a1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + double b0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double b1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1)); + double c[2]; + c[0] = a0 - b0; + c[1] = a1 - b1; + return vld1q_f32((float32_t *) c); +#endif +} + +// Subtract the lower double-precision (64-bit) floating-point element in b from +// the lower double-precision (64-bit) floating-point element in a, store the +// result in the lower element of dst, and copy the upper element from a to the +// upper element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_sd +FORCE_INLINE __m128d _mm_sub_sd(__m128d a, __m128d b) +{ + return _mm_move_sd(a, _mm_sub_pd(a, b)); +} + +// Subtract 64-bit integer b from 64-bit integer a, and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_si64 +FORCE_INLINE __m64 _mm_sub_si64(__m64 a, __m64 b) +{ + return vreinterpret_m64_s64( + vsub_s64(vreinterpret_s64_m64(a), vreinterpret_s64_m64(b))); +} + +// Subtract packed signed 16-bit integers in b from packed 16-bit integers in a +// using saturation, and store the results in dst. 
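+// Results are clamped to the int16_t range, e.g. (-32768) - 1 saturates to
+// -32768 instead of wrapping around to 32767.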
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epi16 +FORCE_INLINE __m128i _mm_subs_epi16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s16( + vqsubq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); +} + +// Subtract packed signed 8-bit integers in b from packed 8-bit integers in a +// using saturation, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epi8 +FORCE_INLINE __m128i _mm_subs_epi8(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s8( + vqsubq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); +} + +// Subtract packed unsigned 16-bit integers in b from packed unsigned 16-bit +// integers in a using saturation, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epu16 +FORCE_INLINE __m128i _mm_subs_epu16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u16( + vqsubq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b))); +} + +// Subtract packed unsigned 8-bit integers in b from packed unsigned 8-bit +// integers in a using saturation, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epu8 +FORCE_INLINE __m128i _mm_subs_epu8(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u8( + vqsubq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b))); +} + +#define _mm_ucomieq_sd _mm_comieq_sd +#define _mm_ucomige_sd _mm_comige_sd +#define _mm_ucomigt_sd _mm_comigt_sd +#define _mm_ucomile_sd _mm_comile_sd +#define _mm_ucomilt_sd _mm_comilt_sd +#define _mm_ucomineq_sd _mm_comineq_sd + +// Return vector of type __m128d with undefined elements. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_undefined_pd +FORCE_INLINE __m128d _mm_undefined_pd(void) +{ +#if defined(__GNUC__) || defined(__clang__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wuninitialized" +#endif + __m128d a; +#if defined(_MSC_VER) && !defined(__clang__) + a = _mm_setzero_pd(); +#endif + return a; +#if defined(__GNUC__) || defined(__clang__) +#pragma GCC diagnostic pop +#endif +} + +// Unpack and interleave 16-bit integers from the high half of a and b, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi16 +FORCE_INLINE __m128i _mm_unpackhi_epi16(__m128i a, __m128i b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128i_s16( + vzip2q_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); +#else + int16x4_t a1 = vget_high_s16(vreinterpretq_s16_m128i(a)); + int16x4_t b1 = vget_high_s16(vreinterpretq_s16_m128i(b)); + int16x4x2_t result = vzip_s16(a1, b1); + return vreinterpretq_m128i_s16(vcombine_s16(result.val[0], result.val[1])); +#endif +} + +// Unpack and interleave 32-bit integers from the high half of a and b, and +// store the results in dst. 
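+// For a = {a0, a1, a2, a3} and b = {b0, b1, b2, b3} (lowest lane first), dst
+// is {a2, b2, a3, b3}.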
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi32 +FORCE_INLINE __m128i _mm_unpackhi_epi32(__m128i a, __m128i b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128i_s32( + vzip2q_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +#else + int32x2_t a1 = vget_high_s32(vreinterpretq_s32_m128i(a)); + int32x2_t b1 = vget_high_s32(vreinterpretq_s32_m128i(b)); + int32x2x2_t result = vzip_s32(a1, b1); + return vreinterpretq_m128i_s32(vcombine_s32(result.val[0], result.val[1])); +#endif +} + +// Unpack and interleave 64-bit integers from the high half of a and b, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi64 +FORCE_INLINE __m128i _mm_unpackhi_epi64(__m128i a, __m128i b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128i_s64( + vzip2q_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b))); +#else + int64x1_t a_h = vget_high_s64(vreinterpretq_s64_m128i(a)); + int64x1_t b_h = vget_high_s64(vreinterpretq_s64_m128i(b)); + return vreinterpretq_m128i_s64(vcombine_s64(a_h, b_h)); +#endif +} + +// Unpack and interleave 8-bit integers from the high half of a and b, and store +// the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi8 +FORCE_INLINE __m128i _mm_unpackhi_epi8(__m128i a, __m128i b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128i_s8( + vzip2q_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); +#else + int8x8_t a1 = + vreinterpret_s8_s16(vget_high_s16(vreinterpretq_s16_m128i(a))); + int8x8_t b1 = + vreinterpret_s8_s16(vget_high_s16(vreinterpretq_s16_m128i(b))); + int8x8x2_t result = vzip_s8(a1, b1); + return vreinterpretq_m128i_s8(vcombine_s8(result.val[0], result.val[1])); +#endif +} + +// Unpack and interleave double-precision (64-bit) floating-point elements from +// the high half of a and b, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_pd +FORCE_INLINE __m128d _mm_unpackhi_pd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128d_f64( + vzip2q_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); +#else + return vreinterpretq_m128d_s64( + vcombine_s64(vget_high_s64(vreinterpretq_s64_m128d(a)), + vget_high_s64(vreinterpretq_s64_m128d(b)))); +#endif +} + +// Unpack and interleave 16-bit integers from the low half of a and b, and store +// the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi16 +FORCE_INLINE __m128i _mm_unpacklo_epi16(__m128i a, __m128i b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128i_s16( + vzip1q_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); +#else + int16x4_t a1 = vget_low_s16(vreinterpretq_s16_m128i(a)); + int16x4_t b1 = vget_low_s16(vreinterpretq_s16_m128i(b)); + int16x4x2_t result = vzip_s16(a1, b1); + return vreinterpretq_m128i_s16(vcombine_s16(result.val[0], result.val[1])); +#endif +} + +// Unpack and interleave 32-bit integers from the low half of a and b, and store +// the results in dst. 
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi32 +FORCE_INLINE __m128i _mm_unpacklo_epi32(__m128i a, __m128i b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128i_s32( + vzip1q_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +#else + int32x2_t a1 = vget_low_s32(vreinterpretq_s32_m128i(a)); + int32x2_t b1 = vget_low_s32(vreinterpretq_s32_m128i(b)); + int32x2x2_t result = vzip_s32(a1, b1); + return vreinterpretq_m128i_s32(vcombine_s32(result.val[0], result.val[1])); +#endif +} + +// Unpack and interleave 64-bit integers from the low half of a and b, and store +// the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi64 +FORCE_INLINE __m128i _mm_unpacklo_epi64(__m128i a, __m128i b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128i_s64( + vzip1q_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b))); +#else + int64x1_t a_l = vget_low_s64(vreinterpretq_s64_m128i(a)); + int64x1_t b_l = vget_low_s64(vreinterpretq_s64_m128i(b)); + return vreinterpretq_m128i_s64(vcombine_s64(a_l, b_l)); +#endif +} + +// Unpack and interleave 8-bit integers from the low half of a and b, and store +// the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi8 +FORCE_INLINE __m128i _mm_unpacklo_epi8(__m128i a, __m128i b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128i_s8( + vzip1q_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); +#else + int8x8_t a1 = vreinterpret_s8_s16(vget_low_s16(vreinterpretq_s16_m128i(a))); + int8x8_t b1 = vreinterpret_s8_s16(vget_low_s16(vreinterpretq_s16_m128i(b))); + int8x8x2_t result = vzip_s8(a1, b1); + return vreinterpretq_m128i_s8(vcombine_s8(result.val[0], result.val[1])); +#endif +} + +// Unpack and interleave double-precision (64-bit) floating-point elements from +// the low half of a and b, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_pd +FORCE_INLINE __m128d _mm_unpacklo_pd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128d_f64( + vzip1q_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); +#else + return vreinterpretq_m128d_s64( + vcombine_s64(vget_low_s64(vreinterpretq_s64_m128d(a)), + vget_low_s64(vreinterpretq_s64_m128d(b)))); +#endif +} + +// Compute the bitwise XOR of packed double-precision (64-bit) floating-point +// elements in a and b, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_xor_pd +FORCE_INLINE __m128d _mm_xor_pd(__m128d a, __m128d b) +{ + return vreinterpretq_m128d_s64( + veorq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b))); +} + +// Compute the bitwise XOR of 128 bits (representing integer data) in a and b, +// and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_xor_si128 +FORCE_INLINE __m128i _mm_xor_si128(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s32( + veorq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +} + +/* SSE3 */ + +// Alternatively add and subtract packed double-precision (64-bit) +// floating-point elements in a to/from packed elements in b, and store the +// results in dst. 
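+// The lower lane is subtracted and the upper lane is added, i.e. dst is
+// {a[0] - b[0], a[1] + b[1]}.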
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_addsub_pd +FORCE_INLINE __m128d _mm_addsub_pd(__m128d a, __m128d b) +{ + _sse2neon_const __m128d mask = _mm_set_pd(1.0f, -1.0f); +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128d_f64(vfmaq_f64(vreinterpretq_f64_m128d(a), + vreinterpretq_f64_m128d(b), + vreinterpretq_f64_m128d(mask))); +#else + return _mm_add_pd(_mm_mul_pd(b, mask), a); +#endif +} + +// Alternatively add and subtract packed single-precision (32-bit) +// floating-point elements in a to/from packed elements in b, and store the +// results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=addsub_ps +FORCE_INLINE __m128 _mm_addsub_ps(__m128 a, __m128 b) +{ + _sse2neon_const __m128 mask = _mm_setr_ps(-1.0f, 1.0f, -1.0f, 1.0f); +#if (defined(__aarch64__) || defined(_M_ARM64)) || \ + defined(__ARM_FEATURE_FMA) /* VFPv4+ */ + return vreinterpretq_m128_f32(vfmaq_f32(vreinterpretq_f32_m128(a), + vreinterpretq_f32_m128(mask), + vreinterpretq_f32_m128(b))); +#else + return _mm_add_ps(_mm_mul_ps(b, mask), a); +#endif +} + +// Horizontally add adjacent pairs of double-precision (64-bit) floating-point +// elements in a and b, and pack the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_pd +FORCE_INLINE __m128d _mm_hadd_pd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128d_f64( + vpaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); +#else + double a0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + double a1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + double b0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double b1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1)); + double c[] = {a0 + a1, b0 + b1}; + return vreinterpretq_m128d_u64(vld1q_u64((uint64_t *) c)); +#endif +} + +// Horizontally add adjacent pairs of single-precision (32-bit) floating-point +// elements in a and b, and pack the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_ps +FORCE_INLINE __m128 _mm_hadd_ps(__m128 a, __m128 b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128_f32( + vpaddq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +#else + float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a)); + float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a)); + float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b)); + float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b)); + return vreinterpretq_m128_f32( + vcombine_f32(vpadd_f32(a10, a32), vpadd_f32(b10, b32))); +#endif +} + +// Horizontally subtract adjacent pairs of double-precision (64-bit) +// floating-point elements in a and b, and pack the results in dst. 
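+// Illustrative expected result: dst = {a[0] - a[1], b[0] - b[1]}, i.e. each
+// input contributes one pair difference to the output.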
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_pd +FORCE_INLINE __m128d _mm_hsub_pd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + float64x2_t _a = vreinterpretq_f64_m128d(a); + float64x2_t _b = vreinterpretq_f64_m128d(b); + return vreinterpretq_m128d_f64( + vsubq_f64(vuzp1q_f64(_a, _b), vuzp2q_f64(_a, _b))); +#else + double a0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + double a1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + double b0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double b1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1)); + double c[] = {a0 - a1, b0 - b1}; + return vreinterpretq_m128d_u64(vld1q_u64((uint64_t *) c)); +#endif +} + +// Horizontally subtract adjacent pairs of single-precision (32-bit) +// floating-point elements in a and b, and pack the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_ps +FORCE_INLINE __m128 _mm_hsub_ps(__m128 _a, __m128 _b) +{ + float32x4_t a = vreinterpretq_f32_m128(_a); + float32x4_t b = vreinterpretq_f32_m128(_b); +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128_f32( + vsubq_f32(vuzp1q_f32(a, b), vuzp2q_f32(a, b))); +#else + float32x4x2_t c = vuzpq_f32(a, b); + return vreinterpretq_m128_f32(vsubq_f32(c.val[0], c.val[1])); +#endif +} + +// Load 128-bits of integer data from unaligned memory into dst. This intrinsic +// may perform better than _mm_loadu_si128 when the data crosses a cache line +// boundary. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_lddqu_si128 +#define _mm_lddqu_si128 _mm_loadu_si128 + +// Load a double-precision (64-bit) floating-point element from memory into both +// elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loaddup_pd +#define _mm_loaddup_pd _mm_load1_pd + +// Duplicate the low double-precision (64-bit) floating-point element from a, +// and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movedup_pd +FORCE_INLINE __m128d _mm_movedup_pd(__m128d a) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128d_f64( + vdupq_laneq_f64(vreinterpretq_f64_m128d(a), 0)); +#else + return vreinterpretq_m128d_u64( + vdupq_n_u64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0))); +#endif +} + +// Duplicate odd-indexed single-precision (32-bit) floating-point elements +// from a, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movehdup_ps +FORCE_INLINE __m128 _mm_movehdup_ps(__m128 a) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128_f32( + vtrn2q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a))); +#elif defined(_sse2neon_shuffle) + return vreinterpretq_m128_f32(vshuffleq_s32( + vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 1, 1, 3, 3)); +#else + float32_t a1 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 1); + float32_t a3 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 3); + float ALIGN_STRUCT(16) data[4] = {a1, a1, a3, a3}; + return vreinterpretq_m128_f32(vld1q_f32(data)); +#endif +} + +// Duplicate even-indexed single-precision (32-bit) floating-point elements +// from a, and store the results in dst. 
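+// Illustrative expected result: a = {a0, a1, a2, a3} gives
+// dst = {a0, a0, a2, a2}.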
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_moveldup_ps +FORCE_INLINE __m128 _mm_moveldup_ps(__m128 a) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128_f32( + vtrn1q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a))); +#elif defined(_sse2neon_shuffle) + return vreinterpretq_m128_f32(vshuffleq_s32( + vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 0, 0, 2, 2)); +#else + float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0); + float32_t a2 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 2); + float ALIGN_STRUCT(16) data[4] = {a0, a0, a2, a2}; + return vreinterpretq_m128_f32(vld1q_f32(data)); +#endif +} + +/* SSSE3 */ + +// Compute the absolute value of packed signed 16-bit integers in a, and store +// the unsigned results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_epi16 +FORCE_INLINE __m128i _mm_abs_epi16(__m128i a) +{ + return vreinterpretq_m128i_s16(vabsq_s16(vreinterpretq_s16_m128i(a))); +} + +// Compute the absolute value of packed signed 32-bit integers in a, and store +// the unsigned results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_epi32 +FORCE_INLINE __m128i _mm_abs_epi32(__m128i a) +{ + return vreinterpretq_m128i_s32(vabsq_s32(vreinterpretq_s32_m128i(a))); +} + +// Compute the absolute value of packed signed 8-bit integers in a, and store +// the unsigned results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_epi8 +FORCE_INLINE __m128i _mm_abs_epi8(__m128i a) +{ + return vreinterpretq_m128i_s8(vabsq_s8(vreinterpretq_s8_m128i(a))); +} + +// Compute the absolute value of packed signed 16-bit integers in a, and store +// the unsigned results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_pi16 +FORCE_INLINE __m64 _mm_abs_pi16(__m64 a) +{ + return vreinterpret_m64_s16(vabs_s16(vreinterpret_s16_m64(a))); +} + +// Compute the absolute value of packed signed 32-bit integers in a, and store +// the unsigned results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_pi32 +FORCE_INLINE __m64 _mm_abs_pi32(__m64 a) +{ + return vreinterpret_m64_s32(vabs_s32(vreinterpret_s32_m64(a))); +} + +// Compute the absolute value of packed signed 8-bit integers in a, and store +// the unsigned results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_pi8 +FORCE_INLINE __m64 _mm_abs_pi8(__m64 a) +{ + return vreinterpret_m64_s8(vabs_s8(vreinterpret_s8_m64(a))); +} + +// Concatenate 16-byte blocks in a and b into a 32-byte temporary result, shift +// the result right by imm8 bytes, and store the low 16 bytes in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_alignr_epi8 +#if defined(__GNUC__) && !defined(__clang__) +#define _mm_alignr_epi8(a, b, imm) \ + __extension__({ \ + uint8x16_t _a = vreinterpretq_u8_m128i(a); \ + uint8x16_t _b = vreinterpretq_u8_m128i(b); \ + __m128i ret; \ + if (_sse2neon_unlikely((imm) & ~31)) \ + ret = vreinterpretq_m128i_u8(vdupq_n_u8(0)); \ + else if (imm >= 16) \ + ret = _mm_srli_si128(a, imm >= 16 ? imm - 16 : 0); \ + else \ + ret = \ + vreinterpretq_m128i_u8(vextq_u8(_b, _a, imm < 16 ? 
imm : 0)); \ + ret; \ + }) + +#else +#define _mm_alignr_epi8(a, b, imm) \ + _sse2neon_define2( \ + __m128i, a, b, uint8x16_t __a = vreinterpretq_u8_m128i(_a); \ + uint8x16_t __b = vreinterpretq_u8_m128i(_b); __m128i ret; \ + if (_sse2neon_unlikely((imm) & ~31)) ret = \ + vreinterpretq_m128i_u8(vdupq_n_u8(0)); \ + else if (imm >= 16) ret = \ + _mm_srli_si128(_a, imm >= 16 ? imm - 16 : 0); \ + else ret = \ + vreinterpretq_m128i_u8(vextq_u8(__b, __a, imm < 16 ? imm : 0)); \ + _sse2neon_return(ret);) + +#endif + +// Concatenate 8-byte blocks in a and b into a 16-byte temporary result, shift +// the result right by imm8 bytes, and store the low 8 bytes in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_alignr_pi8 +#define _mm_alignr_pi8(a, b, imm) \ + _sse2neon_define2( \ + __m64, a, b, __m64 ret; if (_sse2neon_unlikely((imm) >= 16)) { \ + ret = vreinterpret_m64_s8(vdup_n_s8(0)); \ + } else { \ + uint8x8_t tmp_low; \ + uint8x8_t tmp_high; \ + if ((imm) >= 8) { \ + const int idx = (imm) -8; \ + tmp_low = vreinterpret_u8_m64(_a); \ + tmp_high = vdup_n_u8(0); \ + ret = vreinterpret_m64_u8(vext_u8(tmp_low, tmp_high, idx)); \ + } else { \ + const int idx = (imm); \ + tmp_low = vreinterpret_u8_m64(_b); \ + tmp_high = vreinterpret_u8_m64(_a); \ + ret = vreinterpret_m64_u8(vext_u8(tmp_low, tmp_high, idx)); \ + } \ + } _sse2neon_return(ret);) + +// Horizontally add adjacent pairs of 16-bit integers in a and b, and pack the +// signed 16-bit results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_epi16 +FORCE_INLINE __m128i _mm_hadd_epi16(__m128i _a, __m128i _b) +{ + int16x8_t a = vreinterpretq_s16_m128i(_a); + int16x8_t b = vreinterpretq_s16_m128i(_b); +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128i_s16(vpaddq_s16(a, b)); +#else + return vreinterpretq_m128i_s16( + vcombine_s16(vpadd_s16(vget_low_s16(a), vget_high_s16(a)), + vpadd_s16(vget_low_s16(b), vget_high_s16(b)))); +#endif +} + +// Horizontally add adjacent pairs of 32-bit integers in a and b, and pack the +// signed 32-bit results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_epi32 +FORCE_INLINE __m128i _mm_hadd_epi32(__m128i _a, __m128i _b) +{ + int32x4_t a = vreinterpretq_s32_m128i(_a); + int32x4_t b = vreinterpretq_s32_m128i(_b); +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128i_s32(vpaddq_s32(a, b)); +#else + return vreinterpretq_m128i_s32( + vcombine_s32(vpadd_s32(vget_low_s32(a), vget_high_s32(a)), + vpadd_s32(vget_low_s32(b), vget_high_s32(b)))); +#endif +} + +// Horizontally add adjacent pairs of 16-bit integers in a and b, and pack the +// signed 16-bit results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_pi16 +FORCE_INLINE __m64 _mm_hadd_pi16(__m64 a, __m64 b) +{ + return vreinterpret_m64_s16( + vpadd_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b))); +} + +// Horizontally add adjacent pairs of 32-bit integers in a and b, and pack the +// signed 32-bit results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_pi32 +FORCE_INLINE __m64 _mm_hadd_pi32(__m64 a, __m64 b) +{ + return vreinterpret_m64_s32( + vpadd_s32(vreinterpret_s32_m64(a), vreinterpret_s32_m64(b))); +} + +// Horizontally add adjacent pairs of signed 16-bit integers in a and b using +// saturation, and pack the signed 16-bit results in dst. 
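+// Illustrative expected result: dst = {sat(a0+a1), sat(a2+a3), sat(a4+a5),
+// sat(a6+a7), sat(b0+b1), sat(b2+b3), sat(b4+b5), sat(b6+b7)}, where sat()
+// is signed 16-bit saturation, e.g. 0x7FFF + 1 stays at 0x7FFF.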
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadds_epi16 +FORCE_INLINE __m128i _mm_hadds_epi16(__m128i _a, __m128i _b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + int16x8_t a = vreinterpretq_s16_m128i(_a); + int16x8_t b = vreinterpretq_s16_m128i(_b); + return vreinterpretq_s64_s16( + vqaddq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b))); +#else + int32x4_t a = vreinterpretq_s32_m128i(_a); + int32x4_t b = vreinterpretq_s32_m128i(_b); + // Interleave using vshrn/vmovn + // [a0|a2|a4|a6|b0|b2|b4|b6] + // [a1|a3|a5|a7|b1|b3|b5|b7] + int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b)); + int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16)); + // Saturated add + return vreinterpretq_m128i_s16(vqaddq_s16(ab0246, ab1357)); +#endif +} + +// Horizontally add adjacent pairs of signed 16-bit integers in a and b using +// saturation, and pack the signed 16-bit results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadds_pi16 +FORCE_INLINE __m64 _mm_hadds_pi16(__m64 _a, __m64 _b) +{ + int16x4_t a = vreinterpret_s16_m64(_a); + int16x4_t b = vreinterpret_s16_m64(_b); +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpret_s64_s16(vqadd_s16(vuzp1_s16(a, b), vuzp2_s16(a, b))); +#else + int16x4x2_t res = vuzp_s16(a, b); + return vreinterpret_s64_s16(vqadd_s16(res.val[0], res.val[1])); +#endif +} + +// Horizontally subtract adjacent pairs of 16-bit integers in a and b, and pack +// the signed 16-bit results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_epi16 +FORCE_INLINE __m128i _mm_hsub_epi16(__m128i _a, __m128i _b) +{ + int16x8_t a = vreinterpretq_s16_m128i(_a); + int16x8_t b = vreinterpretq_s16_m128i(_b); +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128i_s16( + vsubq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b))); +#else + int16x8x2_t c = vuzpq_s16(a, b); + return vreinterpretq_m128i_s16(vsubq_s16(c.val[0], c.val[1])); +#endif +} + +// Horizontally subtract adjacent pairs of 32-bit integers in a and b, and pack +// the signed 32-bit results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_epi32 +FORCE_INLINE __m128i _mm_hsub_epi32(__m128i _a, __m128i _b) +{ + int32x4_t a = vreinterpretq_s32_m128i(_a); + int32x4_t b = vreinterpretq_s32_m128i(_b); +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128i_s32( + vsubq_s32(vuzp1q_s32(a, b), vuzp2q_s32(a, b))); +#else + int32x4x2_t c = vuzpq_s32(a, b); + return vreinterpretq_m128i_s32(vsubq_s32(c.val[0], c.val[1])); +#endif +} + +// Horizontally subtract adjacent pairs of 16-bit integers in a and b, and pack +// the signed 16-bit results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_pi16 +FORCE_INLINE __m64 _mm_hsub_pi16(__m64 _a, __m64 _b) +{ + int16x4_t a = vreinterpret_s16_m64(_a); + int16x4_t b = vreinterpret_s16_m64(_b); +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpret_m64_s16(vsub_s16(vuzp1_s16(a, b), vuzp2_s16(a, b))); +#else + int16x4x2_t c = vuzp_s16(a, b); + return vreinterpret_m64_s16(vsub_s16(c.val[0], c.val[1])); +#endif +} + +// Horizontally subtract adjacent pairs of 32-bit integers in a and b, and pack +// the signed 32-bit results in dst. 
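+// Illustrative expected result for the 64-bit (__m64) variant:
+// dst = {a[0] - a[1], b[0] - b[1]} as two signed 32-bit lanes.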
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_hsub_pi32 +FORCE_INLINE __m64 _mm_hsub_pi32(__m64 _a, __m64 _b) +{ + int32x2_t a = vreinterpret_s32_m64(_a); + int32x2_t b = vreinterpret_s32_m64(_b); +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpret_m64_s32(vsub_s32(vuzp1_s32(a, b), vuzp2_s32(a, b))); +#else + int32x2x2_t c = vuzp_s32(a, b); + return vreinterpret_m64_s32(vsub_s32(c.val[0], c.val[1])); +#endif +} + +// Horizontally subtract adjacent pairs of signed 16-bit integers in a and b +// using saturation, and pack the signed 16-bit results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsubs_epi16 +FORCE_INLINE __m128i _mm_hsubs_epi16(__m128i _a, __m128i _b) +{ + int16x8_t a = vreinterpretq_s16_m128i(_a); + int16x8_t b = vreinterpretq_s16_m128i(_b); +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128i_s16( + vqsubq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b))); +#else + int16x8x2_t c = vuzpq_s16(a, b); + return vreinterpretq_m128i_s16(vqsubq_s16(c.val[0], c.val[1])); +#endif +} + +// Horizontally subtract adjacent pairs of signed 16-bit integers in a and b +// using saturation, and pack the signed 16-bit results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsubs_pi16 +FORCE_INLINE __m64 _mm_hsubs_pi16(__m64 _a, __m64 _b) +{ + int16x4_t a = vreinterpret_s16_m64(_a); + int16x4_t b = vreinterpret_s16_m64(_b); +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpret_m64_s16(vqsub_s16(vuzp1_s16(a, b), vuzp2_s16(a, b))); +#else + int16x4x2_t c = vuzp_s16(a, b); + return vreinterpret_m64_s16(vqsub_s16(c.val[0], c.val[1])); +#endif +} + +// Vertically multiply each unsigned 8-bit integer from a with the corresponding +// signed 8-bit integer from b, producing intermediate signed 16-bit integers. +// Horizontally add adjacent pairs of intermediate signed 16-bit integers, +// and pack the saturated results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maddubs_epi16 +FORCE_INLINE __m128i _mm_maddubs_epi16(__m128i _a, __m128i _b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + uint8x16_t a = vreinterpretq_u8_m128i(_a); + int8x16_t b = vreinterpretq_s8_m128i(_b); + int16x8_t tl = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(a))), + vmovl_s8(vget_low_s8(b))); + int16x8_t th = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(a))), + vmovl_s8(vget_high_s8(b))); + return vreinterpretq_m128i_s16( + vqaddq_s16(vuzp1q_s16(tl, th), vuzp2q_s16(tl, th))); +#else + // This would be much simpler if x86 would choose to zero extend OR sign + // extend, not both. This could probably be optimized better. + uint16x8_t a = vreinterpretq_u16_m128i(_a); + int16x8_t b = vreinterpretq_s16_m128i(_b); + + // Zero extend a + int16x8_t a_odd = vreinterpretq_s16_u16(vshrq_n_u16(a, 8)); + int16x8_t a_even = vreinterpretq_s16_u16(vbicq_u16(a, vdupq_n_u16(0xff00))); + + // Sign extend by shifting left then shifting right. + int16x8_t b_even = vshrq_n_s16(vshlq_n_s16(b, 8), 8); + int16x8_t b_odd = vshrq_n_s16(b, 8); + + // multiply + int16x8_t prod1 = vmulq_s16(a_even, b_even); + int16x8_t prod2 = vmulq_s16(a_odd, b_odd); + + // saturated add + return vreinterpretq_m128i_s16(vqaddq_s16(prod1, prod2)); +#endif +} + +// Vertically multiply each unsigned 8-bit integer from a with the corresponding +// signed 8-bit integer from b, producing intermediate signed 16-bit integers. 
+// Horizontally add adjacent pairs of intermediate signed 16-bit integers, and +// pack the saturated results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maddubs_pi16 +FORCE_INLINE __m64 _mm_maddubs_pi16(__m64 _a, __m64 _b) +{ + uint16x4_t a = vreinterpret_u16_m64(_a); + int16x4_t b = vreinterpret_s16_m64(_b); + + // Zero extend a + int16x4_t a_odd = vreinterpret_s16_u16(vshr_n_u16(a, 8)); + int16x4_t a_even = vreinterpret_s16_u16(vand_u16(a, vdup_n_u16(0xff))); + + // Sign extend by shifting left then shifting right. + int16x4_t b_even = vshr_n_s16(vshl_n_s16(b, 8), 8); + int16x4_t b_odd = vshr_n_s16(b, 8); + + // multiply + int16x4_t prod1 = vmul_s16(a_even, b_even); + int16x4_t prod2 = vmul_s16(a_odd, b_odd); + + // saturated add + return vreinterpret_m64_s16(vqadd_s16(prod1, prod2)); +} + +// Multiply packed signed 16-bit integers in a and b, producing intermediate +// signed 32-bit integers. Shift right by 15 bits while rounding up, and store +// the packed 16-bit integers in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhrs_epi16 +FORCE_INLINE __m128i _mm_mulhrs_epi16(__m128i a, __m128i b) +{ + // Has issues due to saturation + // return vreinterpretq_m128i_s16(vqrdmulhq_s16(a, b)); + + // Multiply + int32x4_t mul_lo = vmull_s16(vget_low_s16(vreinterpretq_s16_m128i(a)), + vget_low_s16(vreinterpretq_s16_m128i(b))); + int32x4_t mul_hi = vmull_s16(vget_high_s16(vreinterpretq_s16_m128i(a)), + vget_high_s16(vreinterpretq_s16_m128i(b))); + + // Rounding narrowing shift right + // narrow = (int16_t)((mul + 16384) >> 15); + int16x4_t narrow_lo = vrshrn_n_s32(mul_lo, 15); + int16x4_t narrow_hi = vrshrn_n_s32(mul_hi, 15); + + // Join together + return vreinterpretq_m128i_s16(vcombine_s16(narrow_lo, narrow_hi)); +} + +// Multiply packed signed 16-bit integers in a and b, producing intermediate +// signed 32-bit integers. Truncate each intermediate integer to the 18 most +// significant bits, round by adding 1, and store bits [16:1] to dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhrs_pi16 +FORCE_INLINE __m64 _mm_mulhrs_pi16(__m64 a, __m64 b) +{ + int32x4_t mul_extend = + vmull_s16((vreinterpret_s16_m64(a)), (vreinterpret_s16_m64(b))); + + // Rounding narrowing shift right + return vreinterpret_m64_s16(vrshrn_n_s32(mul_extend, 15)); +} + +// Shuffle packed 8-bit integers in a according to shuffle control mask in the +// corresponding 8-bit element of b, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_epi8 +FORCE_INLINE __m128i _mm_shuffle_epi8(__m128i a, __m128i b) +{ + int8x16_t tbl = vreinterpretq_s8_m128i(a); // input a + uint8x16_t idx = vreinterpretq_u8_m128i(b); // input b + uint8x16_t idx_masked = + vandq_u8(idx, vdupq_n_u8(0x8F)); // avoid using meaningless bits +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128i_s8(vqtbl1q_s8(tbl, idx_masked)); +#elif defined(__GNUC__) + int8x16_t ret; + // %e and %f represent the even and odd D registers + // respectively. 
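+ // Each vtbl.8 looks up eight result bytes from the 16-byte table {tbl};
+ // index bytes >= 16 (exactly the case when bit 7 of the control byte
+ // survives the 0x8F mask above) read as zero, matching PSHUFB's zeroing
+ // of lanes whose control byte has its high bit set.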
+ __asm__ __volatile__( + "vtbl.8 %e[ret], {%e[tbl], %f[tbl]}, %e[idx]\n" + "vtbl.8 %f[ret], {%e[tbl], %f[tbl]}, %f[idx]\n" + : [ret] "=&w"(ret) + : [tbl] "w"(tbl), [idx] "w"(idx_masked)); + return vreinterpretq_m128i_s8(ret); +#else + // use this line if testing on aarch64 + int8x8x2_t a_split = {vget_low_s8(tbl), vget_high_s8(tbl)}; + return vreinterpretq_m128i_s8( + vcombine_s8(vtbl2_s8(a_split, vget_low_u8(idx_masked)), + vtbl2_s8(a_split, vget_high_u8(idx_masked)))); +#endif +} + +// Shuffle packed 8-bit integers in a according to shuffle control mask in the +// corresponding 8-bit element of b, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_pi8 +FORCE_INLINE __m64 _mm_shuffle_pi8(__m64 a, __m64 b) +{ + const int8x8_t controlMask = + vand_s8(vreinterpret_s8_m64(b), vdup_n_s8((int8_t) (0x1 << 7 | 0x07))); + int8x8_t res = vtbl1_s8(vreinterpret_s8_m64(a), controlMask); + return vreinterpret_m64_s8(res); +} + +// Negate packed 16-bit integers in a when the corresponding signed +// 16-bit integer in b is negative, and store the results in dst. +// Element in dst are zeroed out when the corresponding element +// in b is zero. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_epi16 +FORCE_INLINE __m128i _mm_sign_epi16(__m128i _a, __m128i _b) +{ + int16x8_t a = vreinterpretq_s16_m128i(_a); + int16x8_t b = vreinterpretq_s16_m128i(_b); + + // signed shift right: faster than vclt + // (b < 0) ? 0xFFFF : 0 + uint16x8_t ltMask = vreinterpretq_u16_s16(vshrq_n_s16(b, 15)); + // (b == 0) ? 0xFFFF : 0 +#if defined(__aarch64__) || defined(_M_ARM64) + int16x8_t zeroMask = vreinterpretq_s16_u16(vceqzq_s16(b)); +#else + int16x8_t zeroMask = vreinterpretq_s16_u16(vceqq_s16(b, vdupq_n_s16(0))); +#endif + + // bitwise select either a or negative 'a' (vnegq_s16(a) equals to negative + // 'a') based on ltMask + int16x8_t masked = vbslq_s16(ltMask, vnegq_s16(a), a); + // res = masked & (~zeroMask) + int16x8_t res = vbicq_s16(masked, zeroMask); + return vreinterpretq_m128i_s16(res); +} + +// Negate packed 32-bit integers in a when the corresponding signed +// 32-bit integer in b is negative, and store the results in dst. +// Element in dst are zeroed out when the corresponding element +// in b is zero. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_epi32 +FORCE_INLINE __m128i _mm_sign_epi32(__m128i _a, __m128i _b) +{ + int32x4_t a = vreinterpretq_s32_m128i(_a); + int32x4_t b = vreinterpretq_s32_m128i(_b); + + // signed shift right: faster than vclt + // (b < 0) ? 0xFFFFFFFF : 0 + uint32x4_t ltMask = vreinterpretq_u32_s32(vshrq_n_s32(b, 31)); + + // (b == 0) ? 0xFFFFFFFF : 0 +#if defined(__aarch64__) || defined(_M_ARM64) + int32x4_t zeroMask = vreinterpretq_s32_u32(vceqzq_s32(b)); +#else + int32x4_t zeroMask = vreinterpretq_s32_u32(vceqq_s32(b, vdupq_n_s32(0))); +#endif + + // bitwise select either a or negative 'a' (vnegq_s32(a) equals to negative + // 'a') based on ltMask + int32x4_t masked = vbslq_s32(ltMask, vnegq_s32(a), a); + // res = masked & (~zeroMask) + int32x4_t res = vbicq_s32(masked, zeroMask); + return vreinterpretq_m128i_s32(res); +} + +// Negate packed 8-bit integers in a when the corresponding signed +// 8-bit integer in b is negative, and store the results in dst. +// Element in dst are zeroed out when the corresponding element +// in b is zero. 
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_epi8 +FORCE_INLINE __m128i _mm_sign_epi8(__m128i _a, __m128i _b) +{ + int8x16_t a = vreinterpretq_s8_m128i(_a); + int8x16_t b = vreinterpretq_s8_m128i(_b); + + // signed shift right: faster than vclt + // (b < 0) ? 0xFF : 0 + uint8x16_t ltMask = vreinterpretq_u8_s8(vshrq_n_s8(b, 7)); + + // (b == 0) ? 0xFF : 0 +#if defined(__aarch64__) || defined(_M_ARM64) + int8x16_t zeroMask = vreinterpretq_s8_u8(vceqzq_s8(b)); +#else + int8x16_t zeroMask = vreinterpretq_s8_u8(vceqq_s8(b, vdupq_n_s8(0))); +#endif + + // bitwise select either a or negative 'a' (vnegq_s8(a) return negative 'a') + // based on ltMask + int8x16_t masked = vbslq_s8(ltMask, vnegq_s8(a), a); + // res = masked & (~zeroMask) + int8x16_t res = vbicq_s8(masked, zeroMask); + + return vreinterpretq_m128i_s8(res); +} + +// Negate packed 16-bit integers in a when the corresponding signed 16-bit +// integer in b is negative, and store the results in dst. Element in dst are +// zeroed out when the corresponding element in b is zero. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_pi16 +FORCE_INLINE __m64 _mm_sign_pi16(__m64 _a, __m64 _b) +{ + int16x4_t a = vreinterpret_s16_m64(_a); + int16x4_t b = vreinterpret_s16_m64(_b); + + // signed shift right: faster than vclt + // (b < 0) ? 0xFFFF : 0 + uint16x4_t ltMask = vreinterpret_u16_s16(vshr_n_s16(b, 15)); + + // (b == 0) ? 0xFFFF : 0 +#if defined(__aarch64__) || defined(_M_ARM64) + int16x4_t zeroMask = vreinterpret_s16_u16(vceqz_s16(b)); +#else + int16x4_t zeroMask = vreinterpret_s16_u16(vceq_s16(b, vdup_n_s16(0))); +#endif + + // bitwise select either a or negative 'a' (vneg_s16(a) return negative 'a') + // based on ltMask + int16x4_t masked = vbsl_s16(ltMask, vneg_s16(a), a); + // res = masked & (~zeroMask) + int16x4_t res = vbic_s16(masked, zeroMask); + + return vreinterpret_m64_s16(res); +} + +// Negate packed 32-bit integers in a when the corresponding signed 32-bit +// integer in b is negative, and store the results in dst. Element in dst are +// zeroed out when the corresponding element in b is zero. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_pi32 +FORCE_INLINE __m64 _mm_sign_pi32(__m64 _a, __m64 _b) +{ + int32x2_t a = vreinterpret_s32_m64(_a); + int32x2_t b = vreinterpret_s32_m64(_b); + + // signed shift right: faster than vclt + // (b < 0) ? 0xFFFFFFFF : 0 + uint32x2_t ltMask = vreinterpret_u32_s32(vshr_n_s32(b, 31)); + + // (b == 0) ? 0xFFFFFFFF : 0 +#if defined(__aarch64__) || defined(_M_ARM64) + int32x2_t zeroMask = vreinterpret_s32_u32(vceqz_s32(b)); +#else + int32x2_t zeroMask = vreinterpret_s32_u32(vceq_s32(b, vdup_n_s32(0))); +#endif + + // bitwise select either a or negative 'a' (vneg_s32(a) return negative 'a') + // based on ltMask + int32x2_t masked = vbsl_s32(ltMask, vneg_s32(a), a); + // res = masked & (~zeroMask) + int32x2_t res = vbic_s32(masked, zeroMask); + + return vreinterpret_m64_s32(res); +} + +// Negate packed 8-bit integers in a when the corresponding signed 8-bit integer +// in b is negative, and store the results in dst. Element in dst are zeroed out +// when the corresponding element in b is zero. 
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_pi8 +FORCE_INLINE __m64 _mm_sign_pi8(__m64 _a, __m64 _b) +{ + int8x8_t a = vreinterpret_s8_m64(_a); + int8x8_t b = vreinterpret_s8_m64(_b); + + // signed shift right: faster than vclt + // (b < 0) ? 0xFF : 0 + uint8x8_t ltMask = vreinterpret_u8_s8(vshr_n_s8(b, 7)); + + // (b == 0) ? 0xFF : 0 +#if defined(__aarch64__) || defined(_M_ARM64) + int8x8_t zeroMask = vreinterpret_s8_u8(vceqz_s8(b)); +#else + int8x8_t zeroMask = vreinterpret_s8_u8(vceq_s8(b, vdup_n_s8(0))); +#endif + + // bitwise select either a or negative 'a' (vneg_s8(a) return negative 'a') + // based on ltMask + int8x8_t masked = vbsl_s8(ltMask, vneg_s8(a), a); + // res = masked & (~zeroMask) + int8x8_t res = vbic_s8(masked, zeroMask); + + return vreinterpret_m64_s8(res); +} + +/* SSE4.1 */ + +// Blend packed 16-bit integers from a and b using control mask imm8, and store +// the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_epi16 +// FORCE_INLINE __m128i _mm_blend_epi16(__m128i a, __m128i b, +// __constrange(0,255) int imm) +#define _mm_blend_epi16(a, b, imm) \ + _sse2neon_define2( \ + __m128i, a, b, \ + const uint16_t _mask[8] = \ + _sse2neon_init(((imm) & (1 << 0)) ? (uint16_t) -1 : 0x0, \ + ((imm) & (1 << 1)) ? (uint16_t) -1 : 0x0, \ + ((imm) & (1 << 2)) ? (uint16_t) -1 : 0x0, \ + ((imm) & (1 << 3)) ? (uint16_t) -1 : 0x0, \ + ((imm) & (1 << 4)) ? (uint16_t) -1 : 0x0, \ + ((imm) & (1 << 5)) ? (uint16_t) -1 : 0x0, \ + ((imm) & (1 << 6)) ? (uint16_t) -1 : 0x0, \ + ((imm) & (1 << 7)) ? (uint16_t) -1 : 0x0); \ + uint16x8_t _mask_vec = vld1q_u16(_mask); \ + uint16x8_t __a = vreinterpretq_u16_m128i(_a); \ + uint16x8_t __b = vreinterpretq_u16_m128i(_b); _sse2neon_return( \ + vreinterpretq_m128i_u16(vbslq_u16(_mask_vec, __b, __a)));) + +// Blend packed double-precision (64-bit) floating-point elements from a and b +// using control mask imm8, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_pd +#define _mm_blend_pd(a, b, imm) \ + _sse2neon_define2( \ + __m128d, a, b, \ + const uint64_t _mask[2] = \ + _sse2neon_init(((imm) & (1 << 0)) ? ~UINT64_C(0) : UINT64_C(0), \ + ((imm) & (1 << 1)) ? ~UINT64_C(0) : UINT64_C(0)); \ + uint64x2_t _mask_vec = vld1q_u64(_mask); \ + uint64x2_t __a = vreinterpretq_u64_m128d(_a); \ + uint64x2_t __b = vreinterpretq_u64_m128d(_b); _sse2neon_return( \ + vreinterpretq_m128d_u64(vbslq_u64(_mask_vec, __b, __a)));) + +// Blend packed single-precision (32-bit) floating-point elements from a and b +// using mask, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_ps +FORCE_INLINE __m128 _mm_blend_ps(__m128 _a, __m128 _b, const char imm8) +{ + const uint32_t ALIGN_STRUCT(16) + data[4] = {((imm8) & (1 << 0)) ? UINT32_MAX : 0, + ((imm8) & (1 << 1)) ? UINT32_MAX : 0, + ((imm8) & (1 << 2)) ? UINT32_MAX : 0, + ((imm8) & (1 << 3)) ? UINT32_MAX : 0}; + uint32x4_t mask = vld1q_u32(data); + float32x4_t a = vreinterpretq_f32_m128(_a); + float32x4_t b = vreinterpretq_f32_m128(_b); + return vreinterpretq_m128_f32(vbslq_f32(mask, b, a)); +} + +// Blend packed 8-bit integers from a and b using mask, and store the results in +// dst. 
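+// Only the most significant bit of each mask byte is consulted: where it is
+// set the byte is taken from b, otherwise from a. The implementation below
+// broadcasts that bit with an arithmetic shift right by 7 before the bitwise
+// select.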
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blendv_epi8 +FORCE_INLINE __m128i _mm_blendv_epi8(__m128i _a, __m128i _b, __m128i _mask) +{ + // Use a signed shift right to create a mask with the sign bit + uint8x16_t mask = + vreinterpretq_u8_s8(vshrq_n_s8(vreinterpretq_s8_m128i(_mask), 7)); + uint8x16_t a = vreinterpretq_u8_m128i(_a); + uint8x16_t b = vreinterpretq_u8_m128i(_b); + return vreinterpretq_m128i_u8(vbslq_u8(mask, b, a)); +} + +// Blend packed double-precision (64-bit) floating-point elements from a and b +// using mask, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blendv_pd +FORCE_INLINE __m128d _mm_blendv_pd(__m128d _a, __m128d _b, __m128d _mask) +{ + uint64x2_t mask = + vreinterpretq_u64_s64(vshrq_n_s64(vreinterpretq_s64_m128d(_mask), 63)); +#if defined(__aarch64__) || defined(_M_ARM64) + float64x2_t a = vreinterpretq_f64_m128d(_a); + float64x2_t b = vreinterpretq_f64_m128d(_b); + return vreinterpretq_m128d_f64(vbslq_f64(mask, b, a)); +#else + uint64x2_t a = vreinterpretq_u64_m128d(_a); + uint64x2_t b = vreinterpretq_u64_m128d(_b); + return vreinterpretq_m128d_u64(vbslq_u64(mask, b, a)); +#endif +} + +// Blend packed single-precision (32-bit) floating-point elements from a and b +// using mask, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blendv_ps +FORCE_INLINE __m128 _mm_blendv_ps(__m128 _a, __m128 _b, __m128 _mask) +{ + // Use a signed shift right to create a mask with the sign bit + uint32x4_t mask = + vreinterpretq_u32_s32(vshrq_n_s32(vreinterpretq_s32_m128(_mask), 31)); + float32x4_t a = vreinterpretq_f32_m128(_a); + float32x4_t b = vreinterpretq_f32_m128(_b); + return vreinterpretq_m128_f32(vbslq_f32(mask, b, a)); +} + +// Round the packed double-precision (64-bit) floating-point elements in a up +// to an integer value, and store the results as packed double-precision +// floating-point elements in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_pd +FORCE_INLINE __m128d _mm_ceil_pd(__m128d a) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128d_f64(vrndpq_f64(vreinterpretq_f64_m128d(a))); +#else + double a0, a1; + a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + a1 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + return _mm_set_pd(ceil(a1), ceil(a0)); +#endif +} + +// Round the packed single-precision (32-bit) floating-point elements in a up to +// an integer value, and store the results as packed single-precision +// floating-point elements in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_ps +FORCE_INLINE __m128 _mm_ceil_ps(__m128 a) +{ +#if (defined(__aarch64__) || defined(_M_ARM64)) || \ + defined(__ARM_FEATURE_DIRECTED_ROUNDING) + return vreinterpretq_m128_f32(vrndpq_f32(vreinterpretq_f32_m128(a))); +#else + float *f = (float *) &a; + return _mm_set_ps(ceilf(f[3]), ceilf(f[2]), ceilf(f[1]), ceilf(f[0])); +#endif +} + +// Round the lower double-precision (64-bit) floating-point element in b up to +// an integer value, store the result as a double-precision floating-point +// element in the lower element of dst, and copy the upper element from a to the +// upper element of dst. 
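+// Illustrative expected result: dst = {ceil(b[0]), a[1]}, e.g. b[0] = 1.25
+// gives dst[0] = 2.0 while dst[1] is copied unchanged from a.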
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_sd +FORCE_INLINE __m128d _mm_ceil_sd(__m128d a, __m128d b) +{ + return _mm_move_sd(a, _mm_ceil_pd(b)); +} + +// Round the lower single-precision (32-bit) floating-point element in b up to +// an integer value, store the result as a single-precision floating-point +// element in the lower element of dst, and copy the upper 3 packed elements +// from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_ss +FORCE_INLINE __m128 _mm_ceil_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_ceil_ps(b)); +} + +// Compare packed 64-bit integers in a and b for equality, and store the results +// in dst +FORCE_INLINE __m128i _mm_cmpeq_epi64(__m128i a, __m128i b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128i_u64( + vceqq_u64(vreinterpretq_u64_m128i(a), vreinterpretq_u64_m128i(b))); +#else + // ARMv7 lacks vceqq_u64 + // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi) + uint32x4_t cmp = + vceqq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b)); + uint32x4_t swapped = vrev64q_u32(cmp); + return vreinterpretq_m128i_u32(vandq_u32(cmp, swapped)); +#endif +} + +// Sign extend packed 16-bit integers in a to packed 32-bit integers, and store +// the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi16_epi32 +FORCE_INLINE __m128i _mm_cvtepi16_epi32(__m128i a) +{ + return vreinterpretq_m128i_s32( + vmovl_s16(vget_low_s16(vreinterpretq_s16_m128i(a)))); +} + +// Sign extend packed 16-bit integers in a to packed 64-bit integers, and store +// the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi16_epi64 +FORCE_INLINE __m128i _mm_cvtepi16_epi64(__m128i a) +{ + int16x8_t s16x8 = vreinterpretq_s16_m128i(a); /* xxxx xxxx xxxx 0B0A */ + int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */ + int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */ + return vreinterpretq_m128i_s64(s64x2); +} + +// Sign extend packed 32-bit integers in a to packed 64-bit integers, and store +// the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi32_epi64 +FORCE_INLINE __m128i _mm_cvtepi32_epi64(__m128i a) +{ + return vreinterpretq_m128i_s64( + vmovl_s32(vget_low_s32(vreinterpretq_s32_m128i(a)))); +} + +// Sign extend packed 8-bit integers in a to packed 16-bit integers, and store +// the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi8_epi16 +FORCE_INLINE __m128i _mm_cvtepi8_epi16(__m128i a) +{ + int8x16_t s8x16 = vreinterpretq_s8_m128i(a); /* xxxx xxxx xxxx DCBA */ + int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0D0C 0B0A */ + return vreinterpretq_m128i_s16(s16x8); +} + +// Sign extend packed 8-bit integers in a to packed 32-bit integers, and store +// the results in dst. 
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi8_epi32 +FORCE_INLINE __m128i _mm_cvtepi8_epi32(__m128i a) +{ + int8x16_t s8x16 = vreinterpretq_s8_m128i(a); /* xxxx xxxx xxxx DCBA */ + int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0D0C 0B0A */ + int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000D 000C 000B 000A */ + return vreinterpretq_m128i_s32(s32x4); +} + +// Sign extend packed 8-bit integers in the low 8 bytes of a to packed 64-bit +// integers, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi8_epi64 +FORCE_INLINE __m128i _mm_cvtepi8_epi64(__m128i a) +{ + int8x16_t s8x16 = vreinterpretq_s8_m128i(a); /* xxxx xxxx xxxx xxBA */ + int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0x0x 0B0A */ + int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */ + int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */ + return vreinterpretq_m128i_s64(s64x2); +} + +// Zero extend packed unsigned 16-bit integers in a to packed 32-bit integers, +// and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu16_epi32 +FORCE_INLINE __m128i _mm_cvtepu16_epi32(__m128i a) +{ + return vreinterpretq_m128i_u32( + vmovl_u16(vget_low_u16(vreinterpretq_u16_m128i(a)))); +} + +// Zero extend packed unsigned 16-bit integers in a to packed 64-bit integers, +// and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu16_epi64 +FORCE_INLINE __m128i _mm_cvtepu16_epi64(__m128i a) +{ + uint16x8_t u16x8 = vreinterpretq_u16_m128i(a); /* xxxx xxxx xxxx 0B0A */ + uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */ + uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */ + return vreinterpretq_m128i_u64(u64x2); +} + +// Zero extend packed unsigned 32-bit integers in a to packed 64-bit integers, +// and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu32_epi64 +FORCE_INLINE __m128i _mm_cvtepu32_epi64(__m128i a) +{ + return vreinterpretq_m128i_u64( + vmovl_u32(vget_low_u32(vreinterpretq_u32_m128i(a)))); +} + +// Zero extend packed unsigned 8-bit integers in a to packed 16-bit integers, +// and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu8_epi16 +FORCE_INLINE __m128i _mm_cvtepu8_epi16(__m128i a) +{ + uint8x16_t u8x16 = vreinterpretq_u8_m128i(a); /* xxxx xxxx HGFE DCBA */ + uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0H0G 0F0E 0D0C 0B0A */ + return vreinterpretq_m128i_u16(u16x8); +} + +// Zero extend packed unsigned 8-bit integers in a to packed 32-bit integers, +// and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu8_epi32 +FORCE_INLINE __m128i _mm_cvtepu8_epi32(__m128i a) +{ + uint8x16_t u8x16 = vreinterpretq_u8_m128i(a); /* xxxx xxxx xxxx DCBA */ + uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0x0x 0x0x 0D0C 0B0A */ + uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000D 000C 000B 000A */ + return vreinterpretq_m128i_u32(u32x4); +} + +// Zero extend packed unsigned 8-bit integers in the low 8 bytes of a to packed +// 64-bit integers, and store the results in dst. 
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu8_epi64 +FORCE_INLINE __m128i _mm_cvtepu8_epi64(__m128i a) +{ + uint8x16_t u8x16 = vreinterpretq_u8_m128i(a); /* xxxx xxxx xxxx xxBA */ + uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0x0x 0x0x 0x0x 0B0A */ + uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */ + uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */ + return vreinterpretq_m128i_u64(u64x2); +} + +// Conditionally multiply the packed double-precision (64-bit) floating-point +// elements in a and b using the high 4 bits in imm8, sum the four products, and +// conditionally store the sum in dst using the low 4 bits of imm8. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dp_pd +FORCE_INLINE __m128d _mm_dp_pd(__m128d a, __m128d b, const int imm) +{ + // Generate mask value from constant immediate bit value + const int64_t bit0Mask = imm & 0x01 ? UINT64_MAX : 0; + const int64_t bit1Mask = imm & 0x02 ? UINT64_MAX : 0; +#if !SSE2NEON_PRECISE_DP + const int64_t bit4Mask = imm & 0x10 ? UINT64_MAX : 0; + const int64_t bit5Mask = imm & 0x20 ? UINT64_MAX : 0; +#endif + // Conditional multiplication +#if !SSE2NEON_PRECISE_DP + __m128d mul = _mm_mul_pd(a, b); + const __m128d mulMask = + _mm_castsi128_pd(_mm_set_epi64x(bit5Mask, bit4Mask)); + __m128d tmp = _mm_and_pd(mul, mulMask); +#else +#if defined(__aarch64__) || defined(_M_ARM64) + double d0 = (imm & 0x10) ? vgetq_lane_f64(vreinterpretq_f64_m128d(a), 0) * + vgetq_lane_f64(vreinterpretq_f64_m128d(b), 0) + : 0; + double d1 = (imm & 0x20) ? vgetq_lane_f64(vreinterpretq_f64_m128d(a), 1) * + vgetq_lane_f64(vreinterpretq_f64_m128d(b), 1) + : 0; +#else + double a0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + double a1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + double b0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double b1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1)); + double d0 = (imm & 0x10) ? a0 * b0 : 0; + double d1 = (imm & 0x20) ? a1 * b1 : 0; +#endif + __m128d tmp = _mm_set_pd(d1, d0); +#endif + // Sum the products +#if defined(__aarch64__) || defined(_M_ARM64) + double sum = vpaddd_f64(vreinterpretq_f64_m128d(tmp)); +#else + double _tmp0 = sse2neon_recast_u64_f64( + vgetq_lane_u64(vreinterpretq_u64_m128d(tmp), 0)); + double _tmp1 = sse2neon_recast_u64_f64( + vgetq_lane_u64(vreinterpretq_u64_m128d(tmp), 1)); + double sum = _tmp0 + _tmp1; +#endif + // Conditionally store the sum + const __m128d sumMask = + _mm_castsi128_pd(_mm_set_epi64x(bit1Mask, bit0Mask)); + __m128d res = _mm_and_pd(_mm_set_pd1(sum), sumMask); + return res; +} + +// Conditionally multiply the packed single-precision (32-bit) floating-point +// elements in a and b using the high 4 bits in imm8, sum the four products, +// and conditionally store the sum in dst using the low 4 bits of imm. 
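+// Illustrative example: imm8 = 0x71 multiplies lanes 0-2 of a and b (high
+// nibble 0b0111), sums the three products, writes the sum to dst[0] (low
+// nibble 0b0001) and zeroes dst[1..3].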
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dp_ps +FORCE_INLINE __m128 _mm_dp_ps(__m128 a, __m128 b, const int imm) +{ + float32x4_t elementwise_prod = _mm_mul_ps(a, b); + +#if defined(__aarch64__) || defined(_M_ARM64) + /* shortcuts */ + if (imm == 0xFF) { + return _mm_set1_ps(vaddvq_f32(elementwise_prod)); + } + + if ((imm & 0x0F) == 0x0F) { + if (!(imm & (1 << 4))) + elementwise_prod = vsetq_lane_f32(0.0f, elementwise_prod, 0); + if (!(imm & (1 << 5))) + elementwise_prod = vsetq_lane_f32(0.0f, elementwise_prod, 1); + if (!(imm & (1 << 6))) + elementwise_prod = vsetq_lane_f32(0.0f, elementwise_prod, 2); + if (!(imm & (1 << 7))) + elementwise_prod = vsetq_lane_f32(0.0f, elementwise_prod, 3); + + return _mm_set1_ps(vaddvq_f32(elementwise_prod)); + } +#endif + + float s = 0.0f; + + if (imm & (1 << 4)) + s += vgetq_lane_f32(elementwise_prod, 0); + if (imm & (1 << 5)) + s += vgetq_lane_f32(elementwise_prod, 1); + if (imm & (1 << 6)) + s += vgetq_lane_f32(elementwise_prod, 2); + if (imm & (1 << 7)) + s += vgetq_lane_f32(elementwise_prod, 3); + + const float32_t res[4] = { + (imm & 0x1) ? s : 0.0f, + (imm & 0x2) ? s : 0.0f, + (imm & 0x4) ? s : 0.0f, + (imm & 0x8) ? s : 0.0f, + }; + return vreinterpretq_m128_f32(vld1q_f32(res)); +} + +// Extract a 32-bit integer from a, selected with imm8, and store the result in +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi32 +// FORCE_INLINE int _mm_extract_epi32(__m128i a, __constrange(0,4) int imm) +#define _mm_extract_epi32(a, imm) \ + vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm)) + +// Extract a 64-bit integer from a, selected with imm8, and store the result in +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi64 +// FORCE_INLINE __int64 _mm_extract_epi64(__m128i a, __constrange(0,2) int imm) +#define _mm_extract_epi64(a, imm) \ + vgetq_lane_s64(vreinterpretq_s64_m128i(a), (imm)) + +// Extract an 8-bit integer from a, selected with imm8, and store the result in +// the lower element of dst. FORCE_INLINE int _mm_extract_epi8(__m128i a, +// __constrange(0,16) int imm) +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi8 +#define _mm_extract_epi8(a, imm) vgetq_lane_u8(vreinterpretq_u8_m128i(a), (imm)) + +// Extracts the selected single-precision (32-bit) floating-point from a. +// FORCE_INLINE int _mm_extract_ps(__m128 a, __constrange(0,4) int imm) +#define _mm_extract_ps(a, imm) vgetq_lane_s32(vreinterpretq_s32_m128(a), (imm)) + +// Round the packed double-precision (64-bit) floating-point elements in a down +// to an integer value, and store the results as packed double-precision +// floating-point elements in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_pd +FORCE_INLINE __m128d _mm_floor_pd(__m128d a) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128d_f64(vrndmq_f64(vreinterpretq_f64_m128d(a))); +#else + double a0, a1; + a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + a1 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + return _mm_set_pd(floor(a1), floor(a0)); +#endif +} + +// Round the packed single-precision (32-bit) floating-point elements in a down +// to an integer value, and store the results as packed single-precision +// floating-point elements in dst. 
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_ps +FORCE_INLINE __m128 _mm_floor_ps(__m128 a) +{ +#if (defined(__aarch64__) || defined(_M_ARM64)) || \ + defined(__ARM_FEATURE_DIRECTED_ROUNDING) + return vreinterpretq_m128_f32(vrndmq_f32(vreinterpretq_f32_m128(a))); +#else + float *f = (float *) &a; + return _mm_set_ps(floorf(f[3]), floorf(f[2]), floorf(f[1]), floorf(f[0])); +#endif +} + +// Round the lower double-precision (64-bit) floating-point element in b down to +// an integer value, store the result as a double-precision floating-point +// element in the lower element of dst, and copy the upper element from a to the +// upper element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_sd +FORCE_INLINE __m128d _mm_floor_sd(__m128d a, __m128d b) +{ + return _mm_move_sd(a, _mm_floor_pd(b)); +} + +// Round the lower single-precision (32-bit) floating-point element in b down to +// an integer value, store the result as a single-precision floating-point +// element in the lower element of dst, and copy the upper 3 packed elements +// from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_ss +FORCE_INLINE __m128 _mm_floor_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_floor_ps(b)); +} + +// Copy a to dst, and insert the 32-bit integer i into dst at the location +// specified by imm8. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi32 +// FORCE_INLINE __m128i _mm_insert_epi32(__m128i a, int b, +// __constrange(0,4) int imm) +#define _mm_insert_epi32(a, b, imm) \ + vreinterpretq_m128i_s32( \ + vsetq_lane_s32((b), vreinterpretq_s32_m128i(a), (imm))) + +// Copy a to dst, and insert the 64-bit integer i into dst at the location +// specified by imm8. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi64 +// FORCE_INLINE __m128i _mm_insert_epi64(__m128i a, __int64 b, +// __constrange(0,2) int imm) +#define _mm_insert_epi64(a, b, imm) \ + vreinterpretq_m128i_s64( \ + vsetq_lane_s64((b), vreinterpretq_s64_m128i(a), (imm))) + +// Copy a to dst, and insert the lower 8-bit integer from i into dst at the +// location specified by imm8. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi8 +// FORCE_INLINE __m128i _mm_insert_epi8(__m128i a, int b, +// __constrange(0,16) int imm) +#define _mm_insert_epi8(a, b, imm) \ + vreinterpretq_m128i_s8(vsetq_lane_s8((b), vreinterpretq_s8_m128i(a), (imm))) + +// Copy a to tmp, then insert a single-precision (32-bit) floating-point +// element from b into tmp using the control in imm8. Store tmp to dst using +// the mask in imm8 (elements are zeroed out when the corresponding bit is set). +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=insert_ps +#define _mm_insert_ps(a, b, imm8) \ + _sse2neon_define2( \ + __m128, a, b, \ + float32x4_t tmp1 = \ + vsetq_lane_f32(vgetq_lane_f32(_b, (imm8 >> 6) & 0x3), \ + vreinterpretq_f32_m128(_a), 0); \ + float32x4_t tmp2 = \ + vsetq_lane_f32(vgetq_lane_f32(tmp1, 0), \ + vreinterpretq_f32_m128(_a), ((imm8 >> 4) & 0x3)); \ + const uint32_t data[4] = \ + _sse2neon_init(((imm8) & (1 << 0)) ? UINT32_MAX : 0, \ + ((imm8) & (1 << 1)) ? UINT32_MAX : 0, \ + ((imm8) & (1 << 2)) ? UINT32_MAX : 0, \ + ((imm8) & (1 << 3)) ? 
UINT32_MAX : 0); \ + uint32x4_t mask = vld1q_u32(data); \ + float32x4_t all_zeros = vdupq_n_f32(0); \ + \ + _sse2neon_return(vreinterpretq_m128_f32( \ + vbslq_f32(mask, all_zeros, vreinterpretq_f32_m128(tmp2))));) + +// Compare packed signed 32-bit integers in a and b, and store packed maximum +// values in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epi32 +FORCE_INLINE __m128i _mm_max_epi32(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s32( + vmaxq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +} + +// Compare packed signed 8-bit integers in a and b, and store packed maximum +// values in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epi8 +FORCE_INLINE __m128i _mm_max_epi8(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s8( + vmaxq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); +} + +// Compare packed unsigned 16-bit integers in a and b, and store packed maximum +// values in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu16 +FORCE_INLINE __m128i _mm_max_epu16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u16( + vmaxq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b))); +} + +// Compare packed unsigned 32-bit integers in a and b, and store packed maximum +// values in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu32 +FORCE_INLINE __m128i _mm_max_epu32(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u32( + vmaxq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b))); +} + +// Compare packed signed 32-bit integers in a and b, and store packed minimum +// values in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epi32 +FORCE_INLINE __m128i _mm_min_epi32(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s32( + vminq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +} + +// Compare packed signed 8-bit integers in a and b, and store packed minimum +// values in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epi8 +FORCE_INLINE __m128i _mm_min_epi8(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s8( + vminq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); +} + +// Compare packed unsigned 16-bit integers in a and b, and store packed minimum +// values in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epu16 +FORCE_INLINE __m128i _mm_min_epu16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u16( + vminq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b))); +} + +// Compare packed unsigned 32-bit integers in a and b, and store packed minimum +// values in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu32 +FORCE_INLINE __m128i _mm_min_epu32(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u32( + vminq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b))); +} + +// Horizontally compute the minimum amongst the packed unsigned 16-bit integers +// in a, store the minimum and index in dst, and zero the remaining bits in dst. 
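+// Illustrative expected result: a = {9, 3, 7, 3, 8, 6, 5, 4} gives
+// dst = {3, 1, 0, 0, 0, 0, 0, 0}, i.e. the minimum value in lane 0 and the
+// index of its first occurrence in lane 1.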
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_minpos_epu16 +FORCE_INLINE __m128i _mm_minpos_epu16(__m128i a) +{ + __m128i dst; + uint16_t min, idx = 0; +#if defined(__aarch64__) || defined(_M_ARM64) + // Find the minimum value + min = vminvq_u16(vreinterpretq_u16_m128i(a)); + + // Get the index of the minimum value + static const uint16_t idxv[] = {0, 1, 2, 3, 4, 5, 6, 7}; + uint16x8_t minv = vdupq_n_u16(min); + uint16x8_t cmeq = vceqq_u16(minv, vreinterpretq_u16_m128i(a)); + idx = vminvq_u16(vornq_u16(vld1q_u16(idxv), cmeq)); +#else + // Find the minimum value + __m64 tmp; + tmp = vreinterpret_m64_u16( + vmin_u16(vget_low_u16(vreinterpretq_u16_m128i(a)), + vget_high_u16(vreinterpretq_u16_m128i(a)))); + tmp = vreinterpret_m64_u16( + vpmin_u16(vreinterpret_u16_m64(tmp), vreinterpret_u16_m64(tmp))); + tmp = vreinterpret_m64_u16( + vpmin_u16(vreinterpret_u16_m64(tmp), vreinterpret_u16_m64(tmp))); + min = vget_lane_u16(vreinterpret_u16_m64(tmp), 0); + // Get the index of the minimum value + int i; + for (i = 0; i < 8; i++) { + if (min == vgetq_lane_u16(vreinterpretq_u16_m128i(a), 0)) { + idx = (uint16_t) i; + break; + } + a = _mm_srli_si128(a, 2); + } +#endif + // Generate result + dst = _mm_setzero_si128(); + dst = vreinterpretq_m128i_u16( + vsetq_lane_u16(min, vreinterpretq_u16_m128i(dst), 0)); + dst = vreinterpretq_m128i_u16( + vsetq_lane_u16(idx, vreinterpretq_u16_m128i(dst), 1)); + return dst; +} + +// Compute the sum of absolute differences (SADs) of quadruplets of unsigned +// 8-bit integers in a compared to those in b, and store the 16-bit results in +// dst. Eight SADs are performed using one quadruplet from b and eight +// quadruplets from a. One quadruplet is selected from b starting at on the +// offset specified in imm8. Eight quadruplets are formed from sequential 8-bit +// integers selected from a starting at the offset specified in imm8. 
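+// Sketch of the selection logic: bit 2 of imm8 picks the starting offset
+// (0 or 4 bytes) of the sliding window in a, and bits 1:0 pick which aligned
+// 4-byte group of b is used; dst[i] is then the sum of absolute differences
+// between bytes a[offset+i..offset+i+3] and that 4-byte group, for i = 0..7.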
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mpsadbw_epu8 +FORCE_INLINE __m128i _mm_mpsadbw_epu8(__m128i a, __m128i b, const int imm) +{ + uint8x16_t _a, _b; + + switch (imm & 0x4) { + case 0: + // do nothing + _a = vreinterpretq_u8_m128i(a); + break; + case 4: + _a = vreinterpretq_u8_u32(vextq_u32(vreinterpretq_u32_m128i(a), + vreinterpretq_u32_m128i(a), 1)); + break; + default: +#if defined(__GNUC__) || defined(__clang__) + __builtin_unreachable(); +#elif defined(_MSC_VER) + __assume(0); +#endif + break; + } + + switch (imm & 0x3) { + case 0: + _b = vreinterpretq_u8_u32( + vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 0))); + break; + case 1: + _b = vreinterpretq_u8_u32( + vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 1))); + break; + case 2: + _b = vreinterpretq_u8_u32( + vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 2))); + break; + case 3: + _b = vreinterpretq_u8_u32( + vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 3))); + break; + default: +#if defined(__GNUC__) || defined(__clang__) + __builtin_unreachable(); +#elif defined(_MSC_VER) + __assume(0); +#endif + break; + } + + int16x8_t c04, c15, c26, c37; + uint8x8_t low_b = vget_low_u8(_b); + c04 = vreinterpretq_s16_u16(vabdl_u8(vget_low_u8(_a), low_b)); + uint8x16_t _a_1 = vextq_u8(_a, _a, 1); + c15 = vreinterpretq_s16_u16(vabdl_u8(vget_low_u8(_a_1), low_b)); + uint8x16_t _a_2 = vextq_u8(_a, _a, 2); + c26 = vreinterpretq_s16_u16(vabdl_u8(vget_low_u8(_a_2), low_b)); + uint8x16_t _a_3 = vextq_u8(_a, _a, 3); + c37 = vreinterpretq_s16_u16(vabdl_u8(vget_low_u8(_a_3), low_b)); +#if defined(__aarch64__) || defined(_M_ARM64) + // |0|4|2|6| + c04 = vpaddq_s16(c04, c26); + // |1|5|3|7| + c15 = vpaddq_s16(c15, c37); + + int32x4_t trn1_c = + vtrn1q_s32(vreinterpretq_s32_s16(c04), vreinterpretq_s32_s16(c15)); + int32x4_t trn2_c = + vtrn2q_s32(vreinterpretq_s32_s16(c04), vreinterpretq_s32_s16(c15)); + return vreinterpretq_m128i_s16(vpaddq_s16(vreinterpretq_s16_s32(trn1_c), + vreinterpretq_s16_s32(trn2_c))); +#else + int16x4_t c01, c23, c45, c67; + c01 = vpadd_s16(vget_low_s16(c04), vget_low_s16(c15)); + c23 = vpadd_s16(vget_low_s16(c26), vget_low_s16(c37)); + c45 = vpadd_s16(vget_high_s16(c04), vget_high_s16(c15)); + c67 = vpadd_s16(vget_high_s16(c26), vget_high_s16(c37)); + + return vreinterpretq_m128i_s16( + vcombine_s16(vpadd_s16(c01, c23), vpadd_s16(c45, c67))); +#endif +} + +// Multiply the low signed 32-bit integers from each packed 64-bit element in +// a and b, and store the signed 64-bit results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_epi32 +FORCE_INLINE __m128i _mm_mul_epi32(__m128i a, __m128i b) +{ + // vmull_s32 upcasts instead of masking, so we downcast. + int32x2_t a_lo = vmovn_s64(vreinterpretq_s64_m128i(a)); + int32x2_t b_lo = vmovn_s64(vreinterpretq_s64_m128i(b)); + return vreinterpretq_m128i_s64(vmull_s32(a_lo, b_lo)); +} + +// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit +// integers, and store the low 32 bits of the intermediate integers in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mullo_epi32 +FORCE_INLINE __m128i _mm_mullo_epi32(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s32( + vmulq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +} + +// Convert packed signed 32-bit integers from a and b to packed 16-bit integers +// using unsigned saturation, and store the results in dst. 
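+// Illustrative only (not part of the upstream sse2neon sources): with
+// unsigned saturation each 32-bit input is clamped to [0, 65535] before the
+// narrowing, e.g. -5 becomes 0, 70000 becomes 65535 and 1234 stays 1234.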
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packus_epi32 +FORCE_INLINE __m128i _mm_packus_epi32(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u16( + vcombine_u16(vqmovun_s32(vreinterpretq_s32_m128i(a)), + vqmovun_s32(vreinterpretq_s32_m128i(b)))); +} + +// Round the packed double-precision (64-bit) floating-point elements in a using +// the rounding parameter, and store the results as packed double-precision +// floating-point elements in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_pd +FORCE_INLINE __m128d _mm_round_pd(__m128d a, int rounding) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + switch (rounding) { + case (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC): + return vreinterpretq_m128d_f64(vrndnq_f64(vreinterpretq_f64_m128d(a))); + case (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC): + return _mm_floor_pd(a); + case (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC): + return _mm_ceil_pd(a); + case (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC): + return vreinterpretq_m128d_f64(vrndq_f64(vreinterpretq_f64_m128d(a))); + default: //_MM_FROUND_CUR_DIRECTION + return vreinterpretq_m128d_f64(vrndiq_f64(vreinterpretq_f64_m128d(a))); + } +#else + double *v_double = (double *) &a; + + if (rounding == (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) || + (rounding == _MM_FROUND_CUR_DIRECTION && + _MM_GET_ROUNDING_MODE() == _MM_ROUND_NEAREST)) { + double res[2], tmp; + for (int i = 0; i < 2; i++) { + tmp = (v_double[i] < 0) ? -v_double[i] : v_double[i]; + double roundDown = floor(tmp); // Round down value + double roundUp = ceil(tmp); // Round up value + double diffDown = tmp - roundDown; + double diffUp = roundUp - tmp; + if (diffDown < diffUp) { + /* If it's closer to the round down value, then use it */ + res[i] = roundDown; + } else if (diffDown > diffUp) { + /* If it's closer to the round up value, then use it */ + res[i] = roundUp; + } else { + /* If it's equidistant between round up and round down value, + * pick the one which is an even number */ + double half = roundDown / 2; + if (half != floor(half)) { + /* If the round down value is odd, return the round up value + */ + res[i] = roundUp; + } else { + /* If the round up value is odd, return the round down value + */ + res[i] = roundDown; + } + } + res[i] = (v_double[i] < 0) ? -res[i] : res[i]; + } + return _mm_set_pd(res[1], res[0]); + } else if (rounding == (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC) || + (rounding == _MM_FROUND_CUR_DIRECTION && + _MM_GET_ROUNDING_MODE() == _MM_ROUND_DOWN)) { + return _mm_floor_pd(a); + } else if (rounding == (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC) || + (rounding == _MM_FROUND_CUR_DIRECTION && + _MM_GET_ROUNDING_MODE() == _MM_ROUND_UP)) { + return _mm_ceil_pd(a); + } + return _mm_set_pd(v_double[1] > 0 ? floor(v_double[1]) : ceil(v_double[1]), + v_double[0] > 0 ? floor(v_double[0]) : ceil(v_double[0])); +#endif +} + +// Round the packed single-precision (32-bit) floating-point elements in a using +// the rounding parameter, and store the results as packed single-precision +// floating-point elements in dst. 
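+// Illustrative only (not part of the upstream sse2neon sources): with
+// (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC), the input
+// {1.5f, 2.5f, -1.5f, 3.7f} rounds to {2.0f, 2.0f, -2.0f, 4.0f}, i.e. ties
+// round to the nearest even value.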
+// software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_ps +FORCE_INLINE __m128 _mm_round_ps(__m128 a, int rounding) +{ +#if (defined(__aarch64__) || defined(_M_ARM64)) || \ + defined(__ARM_FEATURE_DIRECTED_ROUNDING) + switch (rounding) { + case (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC): + return vreinterpretq_m128_f32(vrndnq_f32(vreinterpretq_f32_m128(a))); + case (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC): + return _mm_floor_ps(a); + case (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC): + return _mm_ceil_ps(a); + case (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC): + return vreinterpretq_m128_f32(vrndq_f32(vreinterpretq_f32_m128(a))); + default: //_MM_FROUND_CUR_DIRECTION + return vreinterpretq_m128_f32(vrndiq_f32(vreinterpretq_f32_m128(a))); + } +#else + float *v_float = (float *) &a; + + if (rounding == (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) || + (rounding == _MM_FROUND_CUR_DIRECTION && + _MM_GET_ROUNDING_MODE() == _MM_ROUND_NEAREST)) { + uint32x4_t signmask = vdupq_n_u32(0x80000000); + float32x4_t half = vbslq_f32(signmask, vreinterpretq_f32_m128(a), + vdupq_n_f32(0.5f)); /* +/- 0.5 */ + int32x4_t r_normal = vcvtq_s32_f32(vaddq_f32( + vreinterpretq_f32_m128(a), half)); /* round to integer: [a + 0.5]*/ + int32x4_t r_trunc = vcvtq_s32_f32( + vreinterpretq_f32_m128(a)); /* truncate to integer: [a] */ + int32x4_t plusone = vreinterpretq_s32_u32(vshrq_n_u32( + vreinterpretq_u32_s32(vnegq_s32(r_trunc)), 31)); /* 1 or 0 */ + int32x4_t r_even = vbicq_s32(vaddq_s32(r_trunc, plusone), + vdupq_n_s32(1)); /* ([a] + {0,1}) & ~1 */ + float32x4_t delta = vsubq_f32( + vreinterpretq_f32_m128(a), + vcvtq_f32_s32(r_trunc)); /* compute delta: delta = (a - [a]) */ + uint32x4_t is_delta_half = + vceqq_f32(delta, half); /* delta == +/- 0.5 */ + return vreinterpretq_m128_f32( + vcvtq_f32_s32(vbslq_s32(is_delta_half, r_even, r_normal))); + } else if (rounding == (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC) || + (rounding == _MM_FROUND_CUR_DIRECTION && + _MM_GET_ROUNDING_MODE() == _MM_ROUND_DOWN)) { + return _mm_floor_ps(a); + } else if (rounding == (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC) || + (rounding == _MM_FROUND_CUR_DIRECTION && + _MM_GET_ROUNDING_MODE() == _MM_ROUND_UP)) { + return _mm_ceil_ps(a); + } + return _mm_set_ps(v_float[3] > 0 ? floorf(v_float[3]) : ceilf(v_float[3]), + v_float[2] > 0 ? floorf(v_float[2]) : ceilf(v_float[2]), + v_float[1] > 0 ? floorf(v_float[1]) : ceilf(v_float[1]), + v_float[0] > 0 ? floorf(v_float[0]) : ceilf(v_float[0])); +#endif +} + +// Round the lower double-precision (64-bit) floating-point element in b using +// the rounding parameter, store the result as a double-precision floating-point +// element in the lower element of dst, and copy the upper element from a to the +// upper element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_sd +FORCE_INLINE __m128d _mm_round_sd(__m128d a, __m128d b, int rounding) +{ + return _mm_move_sd(a, _mm_round_pd(b, rounding)); +} + +// Round the lower single-precision (32-bit) floating-point element in b using +// the rounding parameter, store the result as a single-precision floating-point +// element in the lower element of dst, and copy the upper 3 packed elements +// from a to the upper elements of dst. 
Rounding is done according to the +// rounding[3:0] parameter, which can be one of: +// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and +// suppress exceptions +// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and +// suppress exceptions +// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress +// exceptions +// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress +// exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see +// _MM_SET_ROUNDING_MODE +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_ss +FORCE_INLINE __m128 _mm_round_ss(__m128 a, __m128 b, int rounding) +{ + return _mm_move_ss(a, _mm_round_ps(b, rounding)); +} + +// Load 128-bits of integer data from memory into dst using a non-temporal +// memory hint. mem_addr must be aligned on a 16-byte boundary or a +// general-protection exception may be generated. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_load_si128 +FORCE_INLINE __m128i _mm_stream_load_si128(__m128i *p) +{ +#if __has_builtin(__builtin_nontemporal_store) + return __builtin_nontemporal_load(p); +#else + return vreinterpretq_m128i_s64(vld1q_s64((int64_t *) p)); +#endif +} + +// Compute the bitwise NOT of a and then AND with a 128-bit vector containing +// all 1's, and return 1 if the result is zero, otherwise return 0. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_test_all_ones +FORCE_INLINE int _mm_test_all_ones(__m128i a) +{ + return (uint64_t) (vgetq_lane_s64(a, 0) & vgetq_lane_s64(a, 1)) == + ~(uint64_t) 0; +} + +// Compute the bitwise AND of 128 bits (representing integer data) in a and +// mask, and return 1 if the result is zero, otherwise return 0. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_test_all_zeros +FORCE_INLINE int _mm_test_all_zeros(__m128i a, __m128i mask) +{ + int64x2_t a_and_mask = + vandq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(mask)); + return !(vgetq_lane_s64(a_and_mask, 0) | vgetq_lane_s64(a_and_mask, 1)); +} + +// Compute the bitwise AND of 128 bits (representing integer data) in a and +// mask, and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute +// the bitwise NOT of a and then AND with mask, and set CF to 1 if the result is +// zero, otherwise set CF to 0. Return 1 if both the ZF and CF values are zero, +// otherwise return 0. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_test_mix_ones_zero +// Note: Argument names may be wrong in the Intel intrinsics guide. +FORCE_INLINE int _mm_test_mix_ones_zeros(__m128i a, __m128i mask) +{ + uint64x2_t v = vreinterpretq_u64_m128i(a); + uint64x2_t m = vreinterpretq_u64_m128i(mask); + + // find ones (set-bits) and zeros (clear-bits) under clip mask + uint64x2_t ones = vandq_u64(m, v); + uint64x2_t zeros = vbicq_u64(m, v); + + // If both 128-bit variables are populated (non-zero) then return 1. + // For comparison purposes, first compact each var down to 32-bits. + uint32x2_t reduced = vpmax_u32(vqmovn_u64(ones), vqmovn_u64(zeros)); + + // if folding minimum is non-zero then both vars must be non-zero + return (vget_lane_u32(vpmin_u32(reduced, reduced), 0) != 0); +} + +// Compute the bitwise AND of 128 bits (representing integer data) in a and b, +// and set ZF to 1 if the result is zero, otherwise set ZF to 0. 
Compute the +// bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero, +// otherwise set CF to 0. Return the CF value. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testc_si128 +FORCE_INLINE int _mm_testc_si128(__m128i a, __m128i b) +{ + int64x2_t s64 = + vbicq_s64(vreinterpretq_s64_m128i(b), vreinterpretq_s64_m128i(a)); + return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1)); +} + +// Compute the bitwise AND of 128 bits (representing integer data) in a and b, +// and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the +// bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero, +// otherwise set CF to 0. Return 1 if both the ZF and CF values are zero, +// otherwise return 0. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testnzc_si128 +#define _mm_testnzc_si128(a, b) _mm_test_mix_ones_zeros(a, b) + +// Compute the bitwise AND of 128 bits (representing integer data) in a and b, +// and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the +// bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero, +// otherwise set CF to 0. Return the ZF value. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testz_si128 +FORCE_INLINE int _mm_testz_si128(__m128i a, __m128i b) +{ + int64x2_t s64 = + vandq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)); + return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1)); +} + +/* SSE4.2 */ + +static const uint16_t ALIGN_STRUCT(16) _sse2neon_cmpestr_mask16b[8] = { + 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, +}; +static const uint8_t ALIGN_STRUCT(16) _sse2neon_cmpestr_mask8b[16] = { + 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, + 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, +}; + +/* specify the source data format */ +#define _SIDD_UBYTE_OPS 0x00 /* unsigned 8-bit characters */ +#define _SIDD_UWORD_OPS 0x01 /* unsigned 16-bit characters */ +#define _SIDD_SBYTE_OPS 0x02 /* signed 8-bit characters */ +#define _SIDD_SWORD_OPS 0x03 /* signed 16-bit characters */ + +/* specify the comparison operation */ +#define _SIDD_CMP_EQUAL_ANY 0x00 /* compare equal any: strchr */ +#define _SIDD_CMP_RANGES 0x04 /* compare ranges */ +#define _SIDD_CMP_EQUAL_EACH 0x08 /* compare equal each: strcmp */ +#define _SIDD_CMP_EQUAL_ORDERED 0x0C /* compare equal ordered */ + +/* specify the polarity */ +#define _SIDD_POSITIVE_POLARITY 0x00 +#define _SIDD_MASKED_POSITIVE_POLARITY 0x20 +#define _SIDD_NEGATIVE_POLARITY 0x10 /* negate results */ +#define _SIDD_MASKED_NEGATIVE_POLARITY \ + 0x30 /* negate results only before end of string */ + +/* specify the output selection in _mm_cmpXstri */ +#define _SIDD_LEAST_SIGNIFICANT 0x00 +#define _SIDD_MOST_SIGNIFICANT 0x40 + +/* specify the output selection in _mm_cmpXstrm */ +#define _SIDD_BIT_MASK 0x00 +#define _SIDD_UNIT_MASK 0x40 + +/* Pattern Matching for C macros. + * https://github.com/pfultz2/Cloak/wiki/C-Preprocessor-tricks,-tips,-and-idioms + */ + +/* catenate */ +#define SSE2NEON_PRIMITIVE_CAT(a, ...) a##__VA_ARGS__ +#define SSE2NEON_CAT(a, b) SSE2NEON_PRIMITIVE_CAT(a, b) + +#define SSE2NEON_IIF(c) SSE2NEON_PRIMITIVE_CAT(SSE2NEON_IIF_, c) +/* run the 2nd parameter */ +#define SSE2NEON_IIF_0(t, ...) __VA_ARGS__ +/* run the 1st parameter */ +#define SSE2NEON_IIF_1(t, ...) 
t + +#define SSE2NEON_COMPL(b) SSE2NEON_PRIMITIVE_CAT(SSE2NEON_COMPL_, b) +#define SSE2NEON_COMPL_0 1 +#define SSE2NEON_COMPL_1 0 + +#define SSE2NEON_DEC(x) SSE2NEON_PRIMITIVE_CAT(SSE2NEON_DEC_, x) +#define SSE2NEON_DEC_1 0 +#define SSE2NEON_DEC_2 1 +#define SSE2NEON_DEC_3 2 +#define SSE2NEON_DEC_4 3 +#define SSE2NEON_DEC_5 4 +#define SSE2NEON_DEC_6 5 +#define SSE2NEON_DEC_7 6 +#define SSE2NEON_DEC_8 7 +#define SSE2NEON_DEC_9 8 +#define SSE2NEON_DEC_10 9 +#define SSE2NEON_DEC_11 10 +#define SSE2NEON_DEC_12 11 +#define SSE2NEON_DEC_13 12 +#define SSE2NEON_DEC_14 13 +#define SSE2NEON_DEC_15 14 +#define SSE2NEON_DEC_16 15 + +/* detection */ +#define SSE2NEON_CHECK_N(x, n, ...) n +#define SSE2NEON_CHECK(...) SSE2NEON_CHECK_N(__VA_ARGS__, 0, ) +#define SSE2NEON_PROBE(x) x, 1, + +#define SSE2NEON_NOT(x) SSE2NEON_CHECK(SSE2NEON_PRIMITIVE_CAT(SSE2NEON_NOT_, x)) +#define SSE2NEON_NOT_0 SSE2NEON_PROBE(~) + +#define SSE2NEON_BOOL(x) SSE2NEON_COMPL(SSE2NEON_NOT(x)) +#define SSE2NEON_IF(c) SSE2NEON_IIF(SSE2NEON_BOOL(c)) + +#define SSE2NEON_EAT(...) +#define SSE2NEON_EXPAND(...) __VA_ARGS__ +#define SSE2NEON_WHEN(c) SSE2NEON_IF(c)(SSE2NEON_EXPAND, SSE2NEON_EAT) + +/* recursion */ +/* deferred expression */ +#define SSE2NEON_EMPTY() +#define SSE2NEON_DEFER(id) id SSE2NEON_EMPTY() +#define SSE2NEON_OBSTRUCT(...) __VA_ARGS__ SSE2NEON_DEFER(SSE2NEON_EMPTY)() +#define SSE2NEON_EXPAND(...) __VA_ARGS__ + +#define SSE2NEON_EVAL(...) \ + SSE2NEON_EVAL1(SSE2NEON_EVAL1(SSE2NEON_EVAL1(__VA_ARGS__))) +#define SSE2NEON_EVAL1(...) \ + SSE2NEON_EVAL2(SSE2NEON_EVAL2(SSE2NEON_EVAL2(__VA_ARGS__))) +#define SSE2NEON_EVAL2(...) \ + SSE2NEON_EVAL3(SSE2NEON_EVAL3(SSE2NEON_EVAL3(__VA_ARGS__))) +#define SSE2NEON_EVAL3(...) __VA_ARGS__ + +#define SSE2NEON_REPEAT(count, macro, ...) \ + SSE2NEON_WHEN(count) \ + (SSE2NEON_OBSTRUCT(SSE2NEON_REPEAT_INDIRECT)()( \ + SSE2NEON_DEC(count), macro, \ + __VA_ARGS__) SSE2NEON_OBSTRUCT(macro)(SSE2NEON_DEC(count), \ + __VA_ARGS__)) +#define SSE2NEON_REPEAT_INDIRECT() SSE2NEON_REPEAT + +#define SSE2NEON_SIZE_OF_byte 8 +#define SSE2NEON_NUMBER_OF_LANES_byte 16 +#define SSE2NEON_SIZE_OF_word 16 +#define SSE2NEON_NUMBER_OF_LANES_word 8 + +#define SSE2NEON_COMPARE_EQUAL_THEN_FILL_LANE(i, type) \ + mtx[i] = vreinterpretq_m128i_##type(vceqq_##type( \ + vdupq_n_##type(vgetq_lane_##type(vreinterpretq_##type##_m128i(b), i)), \ + vreinterpretq_##type##_m128i(a))); + +#define SSE2NEON_FILL_LANE(i, type) \ + vec_b[i] = \ + vdupq_n_##type(vgetq_lane_##type(vreinterpretq_##type##_m128i(b), i)); + +#define PCMPSTR_RANGES(a, b, mtx, data_type_prefix, type_prefix, size, \ + number_of_lanes, byte_or_word) \ + do { \ + SSE2NEON_CAT( \ + data_type_prefix, \ + SSE2NEON_CAT(size, \ + SSE2NEON_CAT(x, SSE2NEON_CAT(number_of_lanes, _t)))) \ + vec_b[number_of_lanes]; \ + __m128i mask = SSE2NEON_IIF(byte_or_word)( \ + vreinterpretq_m128i_u16(vdupq_n_u16(0xff)), \ + vreinterpretq_m128i_u32(vdupq_n_u32(0xffff))); \ + SSE2NEON_EVAL(SSE2NEON_REPEAT(number_of_lanes, SSE2NEON_FILL_LANE, \ + SSE2NEON_CAT(type_prefix, size))) \ + for (int i = 0; i < number_of_lanes; i++) { \ + mtx[i] = SSE2NEON_CAT(vreinterpretq_m128i_u, \ + size)(SSE2NEON_CAT(vbslq_u, size)( \ + SSE2NEON_CAT(vreinterpretq_u, \ + SSE2NEON_CAT(size, _m128i))(mask), \ + SSE2NEON_CAT(vcgeq_, SSE2NEON_CAT(type_prefix, size))( \ + vec_b[i], \ + SSE2NEON_CAT( \ + vreinterpretq_, \ + SSE2NEON_CAT(type_prefix, \ + SSE2NEON_CAT(size, _m128i(a))))), \ + SSE2NEON_CAT(vcleq_, SSE2NEON_CAT(type_prefix, size))( \ + vec_b[i], \ + SSE2NEON_CAT( \ + vreinterpretq_, \ + 
SSE2NEON_CAT(type_prefix, \ + SSE2NEON_CAT(size, _m128i(a))))))); \ + } \ + } while (0) + +#define PCMPSTR_EQ(a, b, mtx, size, number_of_lanes) \ + do { \ + SSE2NEON_EVAL(SSE2NEON_REPEAT(number_of_lanes, \ + SSE2NEON_COMPARE_EQUAL_THEN_FILL_LANE, \ + SSE2NEON_CAT(u, size))) \ + } while (0) + +#define SSE2NEON_CMP_EQUAL_ANY_IMPL(type) \ + static int _sse2neon_cmp_##type##_equal_any(__m128i a, int la, __m128i b, \ + int lb) \ + { \ + __m128i mtx[16]; \ + PCMPSTR_EQ(a, b, mtx, SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type), \ + SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type)); \ + return SSE2NEON_CAT( \ + _sse2neon_aggregate_equal_any_, \ + SSE2NEON_CAT( \ + SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type), \ + SSE2NEON_CAT(x, SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, \ + type))))(la, lb, mtx); \ + } + +#define SSE2NEON_CMP_RANGES_IMPL(type, data_type, us, byte_or_word) \ + static int _sse2neon_cmp_##us##type##_ranges(__m128i a, int la, __m128i b, \ + int lb) \ + { \ + __m128i mtx[16]; \ + PCMPSTR_RANGES( \ + a, b, mtx, data_type, us, SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type), \ + SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type), byte_or_word); \ + return SSE2NEON_CAT( \ + _sse2neon_aggregate_ranges_, \ + SSE2NEON_CAT( \ + SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type), \ + SSE2NEON_CAT(x, SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, \ + type))))(la, lb, mtx); \ + } + +#define SSE2NEON_CMP_EQUAL_ORDERED_IMPL(type) \ + static int _sse2neon_cmp_##type##_equal_ordered(__m128i a, int la, \ + __m128i b, int lb) \ + { \ + __m128i mtx[16]; \ + PCMPSTR_EQ(a, b, mtx, SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type), \ + SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type)); \ + return SSE2NEON_CAT( \ + _sse2neon_aggregate_equal_ordered_, \ + SSE2NEON_CAT( \ + SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type), \ + SSE2NEON_CAT(x, \ + SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type))))( \ + SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type), la, lb, mtx); \ + } + +static int _sse2neon_aggregate_equal_any_8x16(int la, int lb, __m128i mtx[16]) +{ + int res = 0; + int m = (1 << la) - 1; + uint8x8_t vec_mask = vld1_u8(_sse2neon_cmpestr_mask8b); + uint8x8_t t_lo = vtst_u8(vdup_n_u8(m & 0xff), vec_mask); + uint8x8_t t_hi = vtst_u8(vdup_n_u8(m >> 8), vec_mask); + uint8x16_t vec = vcombine_u8(t_lo, t_hi); + for (int j = 0; j < lb; j++) { + mtx[j] = vreinterpretq_m128i_u8( + vandq_u8(vec, vreinterpretq_u8_m128i(mtx[j]))); + mtx[j] = vreinterpretq_m128i_u8( + vshrq_n_u8(vreinterpretq_u8_m128i(mtx[j]), 7)); + int tmp = _sse2neon_vaddvq_u8(vreinterpretq_u8_m128i(mtx[j])) ? 1 : 0; + res |= (tmp << j); + } + return res; +} + +static int _sse2neon_aggregate_equal_any_16x8(int la, int lb, __m128i mtx[16]) +{ + int res = 0; + int m = (1 << la) - 1; + uint16x8_t vec = + vtstq_u16(vdupq_n_u16(m), vld1q_u16(_sse2neon_cmpestr_mask16b)); + for (int j = 0; j < lb; j++) { + mtx[j] = vreinterpretq_m128i_u16( + vandq_u16(vec, vreinterpretq_u16_m128i(mtx[j]))); + mtx[j] = vreinterpretq_m128i_u16( + vshrq_n_u16(vreinterpretq_u16_m128i(mtx[j]), 15)); + int tmp = _sse2neon_vaddvq_u16(vreinterpretq_u16_m128i(mtx[j])) ? 
1 : 0; + res |= (tmp << j); + } + return res; +} + +/* clang-format off */ +#define SSE2NEON_GENERATE_CMP_EQUAL_ANY(prefix) \ + prefix##IMPL(byte) \ + prefix##IMPL(word) +/* clang-format on */ + +SSE2NEON_GENERATE_CMP_EQUAL_ANY(SSE2NEON_CMP_EQUAL_ANY_) + +static int _sse2neon_aggregate_ranges_16x8(int la, int lb, __m128i mtx[16]) +{ + int res = 0; + int m = (1 << la) - 1; + uint16x8_t vec = + vtstq_u16(vdupq_n_u16(m), vld1q_u16(_sse2neon_cmpestr_mask16b)); + for (int j = 0; j < lb; j++) { + mtx[j] = vreinterpretq_m128i_u16( + vandq_u16(vec, vreinterpretq_u16_m128i(mtx[j]))); + mtx[j] = vreinterpretq_m128i_u16( + vshrq_n_u16(vreinterpretq_u16_m128i(mtx[j]), 15)); + __m128i tmp = vreinterpretq_m128i_u32( + vshrq_n_u32(vreinterpretq_u32_m128i(mtx[j]), 16)); + uint32x4_t vec_res = vandq_u32(vreinterpretq_u32_m128i(mtx[j]), + vreinterpretq_u32_m128i(tmp)); +#if defined(__aarch64__) || defined(_M_ARM64) + int t = vaddvq_u32(vec_res) ? 1 : 0; +#else + uint64x2_t sumh = vpaddlq_u32(vec_res); + int t = vgetq_lane_u64(sumh, 0) + vgetq_lane_u64(sumh, 1); +#endif + res |= (t << j); + } + return res; +} + +static int _sse2neon_aggregate_ranges_8x16(int la, int lb, __m128i mtx[16]) +{ + int res = 0; + int m = (1 << la) - 1; + uint8x8_t vec_mask = vld1_u8(_sse2neon_cmpestr_mask8b); + uint8x8_t t_lo = vtst_u8(vdup_n_u8(m & 0xff), vec_mask); + uint8x8_t t_hi = vtst_u8(vdup_n_u8(m >> 8), vec_mask); + uint8x16_t vec = vcombine_u8(t_lo, t_hi); + for (int j = 0; j < lb; j++) { + mtx[j] = vreinterpretq_m128i_u8( + vandq_u8(vec, vreinterpretq_u8_m128i(mtx[j]))); + mtx[j] = vreinterpretq_m128i_u8( + vshrq_n_u8(vreinterpretq_u8_m128i(mtx[j]), 7)); + __m128i tmp = vreinterpretq_m128i_u16( + vshrq_n_u16(vreinterpretq_u16_m128i(mtx[j]), 8)); + uint16x8_t vec_res = vandq_u16(vreinterpretq_u16_m128i(mtx[j]), + vreinterpretq_u16_m128i(tmp)); + int t = _sse2neon_vaddvq_u16(vec_res) ? 1 : 0; + res |= (t << j); + } + return res; +} + +#define SSE2NEON_CMP_RANGES_IS_BYTE 1 +#define SSE2NEON_CMP_RANGES_IS_WORD 0 + +/* clang-format off */ +#define SSE2NEON_GENERATE_CMP_RANGES(prefix) \ + prefix##IMPL(byte, uint, u, prefix##IS_BYTE) \ + prefix##IMPL(byte, int, s, prefix##IS_BYTE) \ + prefix##IMPL(word, uint, u, prefix##IS_WORD) \ + prefix##IMPL(word, int, s, prefix##IS_WORD) +/* clang-format on */ + +SSE2NEON_GENERATE_CMP_RANGES(SSE2NEON_CMP_RANGES_) + +#undef SSE2NEON_CMP_RANGES_IS_BYTE +#undef SSE2NEON_CMP_RANGES_IS_WORD + +static int _sse2neon_cmp_byte_equal_each(__m128i a, int la, __m128i b, int lb) +{ + uint8x16_t mtx = + vceqq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)); + int m0 = (la < lb) ? 
0 : ((1 << la) - (1 << lb)); + int m1 = 0x10000 - (1 << la); + int tb = 0x10000 - (1 << lb); + uint8x8_t vec_mask, vec0_lo, vec0_hi, vec1_lo, vec1_hi; + uint8x8_t tmp_lo, tmp_hi, res_lo, res_hi; + vec_mask = vld1_u8(_sse2neon_cmpestr_mask8b); + vec0_lo = vtst_u8(vdup_n_u8(m0), vec_mask); + vec0_hi = vtst_u8(vdup_n_u8(m0 >> 8), vec_mask); + vec1_lo = vtst_u8(vdup_n_u8(m1), vec_mask); + vec1_hi = vtst_u8(vdup_n_u8(m1 >> 8), vec_mask); + tmp_lo = vtst_u8(vdup_n_u8(tb), vec_mask); + tmp_hi = vtst_u8(vdup_n_u8(tb >> 8), vec_mask); + + res_lo = vbsl_u8(vec0_lo, vdup_n_u8(0), vget_low_u8(mtx)); + res_hi = vbsl_u8(vec0_hi, vdup_n_u8(0), vget_high_u8(mtx)); + res_lo = vbsl_u8(vec1_lo, tmp_lo, res_lo); + res_hi = vbsl_u8(vec1_hi, tmp_hi, res_hi); + res_lo = vand_u8(res_lo, vec_mask); + res_hi = vand_u8(res_hi, vec_mask); + + int res = _sse2neon_vaddv_u8(res_lo) + (_sse2neon_vaddv_u8(res_hi) << 8); + return res; +} + +static int _sse2neon_cmp_word_equal_each(__m128i a, int la, __m128i b, int lb) +{ + uint16x8_t mtx = + vceqq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)); + int m0 = (la < lb) ? 0 : ((1 << la) - (1 << lb)); + int m1 = 0x100 - (1 << la); + int tb = 0x100 - (1 << lb); + uint16x8_t vec_mask = vld1q_u16(_sse2neon_cmpestr_mask16b); + uint16x8_t vec0 = vtstq_u16(vdupq_n_u16(m0), vec_mask); + uint16x8_t vec1 = vtstq_u16(vdupq_n_u16(m1), vec_mask); + uint16x8_t tmp = vtstq_u16(vdupq_n_u16(tb), vec_mask); + mtx = vbslq_u16(vec0, vdupq_n_u16(0), mtx); + mtx = vbslq_u16(vec1, tmp, mtx); + mtx = vandq_u16(mtx, vec_mask); + return _sse2neon_vaddvq_u16(mtx); +} + +#define SSE2NEON_AGGREGATE_EQUAL_ORDER_IS_UBYTE 1 +#define SSE2NEON_AGGREGATE_EQUAL_ORDER_IS_UWORD 0 + +#define SSE2NEON_AGGREGATE_EQUAL_ORDER_IMPL(size, number_of_lanes, data_type) \ + static int _sse2neon_aggregate_equal_ordered_##size##x##number_of_lanes( \ + int bound, int la, int lb, __m128i mtx[16]) \ + { \ + int res = 0; \ + int m1 = SSE2NEON_IIF(data_type)(0x10000, 0x100) - (1 << la); \ + uint##size##x8_t vec_mask = SSE2NEON_IIF(data_type)( \ + vld1_u##size(_sse2neon_cmpestr_mask##size##b), \ + vld1q_u##size(_sse2neon_cmpestr_mask##size##b)); \ + uint##size##x##number_of_lanes##_t vec1 = SSE2NEON_IIF(data_type)( \ + vcombine_u##size(vtst_u##size(vdup_n_u##size(m1), vec_mask), \ + vtst_u##size(vdup_n_u##size(m1 >> 8), vec_mask)), \ + vtstq_u##size(vdupq_n_u##size(m1), vec_mask)); \ + uint##size##x##number_of_lanes##_t vec_minusone = vdupq_n_u##size(-1); \ + uint##size##x##number_of_lanes##_t vec_zero = vdupq_n_u##size(0); \ + for (int j = 0; j < lb; j++) { \ + mtx[j] = vreinterpretq_m128i_u##size(vbslq_u##size( \ + vec1, vec_minusone, vreinterpretq_u##size##_m128i(mtx[j]))); \ + } \ + for (int j = lb; j < bound; j++) { \ + mtx[j] = vreinterpretq_m128i_u##size( \ + vbslq_u##size(vec1, vec_minusone, vec_zero)); \ + } \ + unsigned SSE2NEON_IIF(data_type)(char, short) *ptr = \ + (unsigned SSE2NEON_IIF(data_type)(char, short) *) mtx; \ + for (int i = 0; i < bound; i++) { \ + int val = 1; \ + for (int j = 0, k = i; j < bound - i && k < bound; j++, k++) \ + val &= ptr[k * bound + j]; \ + res += val << i; \ + } \ + return res; \ + } + +/* clang-format off */ +#define SSE2NEON_GENERATE_AGGREGATE_EQUAL_ORDER(prefix) \ + prefix##IMPL(8, 16, prefix##IS_UBYTE) \ + prefix##IMPL(16, 8, prefix##IS_UWORD) +/* clang-format on */ + +SSE2NEON_GENERATE_AGGREGATE_EQUAL_ORDER(SSE2NEON_AGGREGATE_EQUAL_ORDER_) + +#undef SSE2NEON_AGGREGATE_EQUAL_ORDER_IS_UBYTE +#undef SSE2NEON_AGGREGATE_EQUAL_ORDER_IS_UWORD + +/* clang-format off */ +#define 
SSE2NEON_GENERATE_CMP_EQUAL_ORDERED(prefix) \ + prefix##IMPL(byte) \ + prefix##IMPL(word) +/* clang-format on */ + +SSE2NEON_GENERATE_CMP_EQUAL_ORDERED(SSE2NEON_CMP_EQUAL_ORDERED_) + +#define SSE2NEON_CMPESTR_LIST \ + _(CMP_UBYTE_EQUAL_ANY, cmp_byte_equal_any) \ + _(CMP_UWORD_EQUAL_ANY, cmp_word_equal_any) \ + _(CMP_SBYTE_EQUAL_ANY, cmp_byte_equal_any) \ + _(CMP_SWORD_EQUAL_ANY, cmp_word_equal_any) \ + _(CMP_UBYTE_RANGES, cmp_ubyte_ranges) \ + _(CMP_UWORD_RANGES, cmp_uword_ranges) \ + _(CMP_SBYTE_RANGES, cmp_sbyte_ranges) \ + _(CMP_SWORD_RANGES, cmp_sword_ranges) \ + _(CMP_UBYTE_EQUAL_EACH, cmp_byte_equal_each) \ + _(CMP_UWORD_EQUAL_EACH, cmp_word_equal_each) \ + _(CMP_SBYTE_EQUAL_EACH, cmp_byte_equal_each) \ + _(CMP_SWORD_EQUAL_EACH, cmp_word_equal_each) \ + _(CMP_UBYTE_EQUAL_ORDERED, cmp_byte_equal_ordered) \ + _(CMP_UWORD_EQUAL_ORDERED, cmp_word_equal_ordered) \ + _(CMP_SBYTE_EQUAL_ORDERED, cmp_byte_equal_ordered) \ + _(CMP_SWORD_EQUAL_ORDERED, cmp_word_equal_ordered) + +enum { +#define _(name, func_suffix) name, + SSE2NEON_CMPESTR_LIST +#undef _ +}; +typedef int (*cmpestr_func_t)(__m128i a, int la, __m128i b, int lb); +static cmpestr_func_t _sse2neon_cmpfunc_table[] = { +#define _(name, func_suffix) _sse2neon_##func_suffix, + SSE2NEON_CMPESTR_LIST +#undef _ +}; + +FORCE_INLINE int _sse2neon_sido_negative(int res, int lb, int imm8, int bound) +{ + switch (imm8 & 0x30) { + case _SIDD_NEGATIVE_POLARITY: + res ^= 0xffffffff; + break; + case _SIDD_MASKED_NEGATIVE_POLARITY: + res ^= (1 << lb) - 1; + break; + default: + break; + } + + return res & ((bound == 8) ? 0xFF : 0xFFFF); +} + +FORCE_INLINE int _sse2neon_clz(unsigned int x) +{ +#if defined(_MSC_VER) && !defined(__clang__) + unsigned long cnt = 0; + if (_BitScanReverse(&cnt, x)) + return 31 - cnt; + return 32; +#else + return x != 0 ? __builtin_clz(x) : 32; +#endif +} + +FORCE_INLINE int _sse2neon_ctz(unsigned int x) +{ +#if defined(_MSC_VER) && !defined(__clang__) + unsigned long cnt = 0; + if (_BitScanForward(&cnt, x)) + return cnt; + return 32; +#else + return x != 0 ? __builtin_ctz(x) : 32; +#endif +} + +FORCE_INLINE int _sse2neon_ctzll(unsigned long long x) +{ +#ifdef _MSC_VER + unsigned long cnt; +#if defined(SSE2NEON_HAS_BITSCAN64) + if (_BitScanForward64(&cnt, x)) + return (int) (cnt); +#else + if (_BitScanForward(&cnt, (unsigned long) (x))) + return (int) cnt; + if (_BitScanForward(&cnt, (unsigned long) (x >> 32))) + return (int) (cnt + 32); +#endif /* SSE2NEON_HAS_BITSCAN64 */ + return 64; +#else /* assume GNU compatible compilers */ + return x != 0 ? __builtin_ctzll(x) : 64; +#endif +} + +#define SSE2NEON_MIN(x, y) (x) < (y) ? (x) : (y) + +#define SSE2NEON_CMPSTR_SET_UPPER(var, imm) \ + const int var = (imm & 0x01) ? 8 : 16 + +#define SSE2NEON_CMPESTRX_LEN_PAIR(a, b, la, lb) \ + int tmp1 = la ^ (la >> 31); \ + la = tmp1 - (la >> 31); \ + int tmp2 = lb ^ (lb >> 31); \ + lb = tmp2 - (lb >> 31); \ + la = SSE2NEON_MIN(la, bound); \ + lb = SSE2NEON_MIN(lb, bound) + +// Compare all pairs of character in string a and b, +// then aggregate the result. +// As the only difference of PCMPESTR* and PCMPISTR* is the way to calculate the +// length of string, we use SSE2NEON_CMP{I,E}STRX_GET_LEN to get the length of +// string a and b. 
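+// Illustrative only (not part of the upstream sse2neon sources): the public
+// _mm_cmpistr*/_mm_cmpestr* wrappers built on the helpers below are typically
+// used like this, assuming `buf` points to at least 16 readable bytes:
+//
+//   static const char set16[16] = "aeiou";          // NUL-padded to 16 bytes
+//   __m128i set = _mm_loadu_si128((const __m128i *) set16);
+//   __m128i text = _mm_loadu_si128((const __m128i *) buf);
+//   int idx = _mm_cmpistri(set, text,
+//                          _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY |
+//                              _SIDD_LEAST_SIGNIFICANT);
+//   // idx is 16 when no vowel occurs in the first 16 bytes of `text`,
+//   // otherwise the offset of the first vowel.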
+#define SSE2NEON_COMP_AGG(a, b, la, lb, imm8, IE) \ + SSE2NEON_CMPSTR_SET_UPPER(bound, imm8); \ + SSE2NEON_##IE##_LEN_PAIR(a, b, la, lb); \ + int r2 = (_sse2neon_cmpfunc_table[imm8 & 0x0f])(a, la, b, lb); \ + r2 = _sse2neon_sido_negative(r2, lb, imm8, bound) + +#define SSE2NEON_CMPSTR_GENERATE_INDEX(r2, bound, imm8) \ + return (r2 == 0) ? bound \ + : ((imm8 & 0x40) ? (31 - _sse2neon_clz(r2)) \ + : _sse2neon_ctz(r2)) + +#define SSE2NEON_CMPSTR_GENERATE_MASK(dst) \ + __m128i dst = vreinterpretq_m128i_u8(vdupq_n_u8(0)); \ + if (imm8 & 0x40) { \ + if (bound == 8) { \ + uint16x8_t tmp = vtstq_u16(vdupq_n_u16(r2), \ + vld1q_u16(_sse2neon_cmpestr_mask16b)); \ + dst = vreinterpretq_m128i_u16(vbslq_u16( \ + tmp, vdupq_n_u16(-1), vreinterpretq_u16_m128i(dst))); \ + } else { \ + uint8x16_t vec_r2 = \ + vcombine_u8(vdup_n_u8(r2), vdup_n_u8(r2 >> 8)); \ + uint8x16_t tmp = \ + vtstq_u8(vec_r2, vld1q_u8(_sse2neon_cmpestr_mask8b)); \ + dst = vreinterpretq_m128i_u8( \ + vbslq_u8(tmp, vdupq_n_u8(-1), vreinterpretq_u8_m128i(dst))); \ + } \ + } else { \ + if (bound == 16) { \ + dst = vreinterpretq_m128i_u16( \ + vsetq_lane_u16(r2 & 0xffff, vreinterpretq_u16_m128i(dst), 0)); \ + } else { \ + dst = vreinterpretq_m128i_u8( \ + vsetq_lane_u8(r2 & 0xff, vreinterpretq_u8_m128i(dst), 0)); \ + } \ + } \ + return dst + +// Compare packed strings in a and b with lengths la and lb using the control +// in imm8, and returns 1 if b did not contain a null character and the +// resulting mask was zero, and 0 otherwise. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestra +FORCE_INLINE int _mm_cmpestra(__m128i a, + int la, + __m128i b, + int lb, + const int imm8) +{ + int lb_cpy = lb; + SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPESTRX); + return !r2 & (lb_cpy > bound); +} + +// Compare packed strings in a and b with lengths la and lb using the control in +// imm8, and returns 1 if the resulting mask was non-zero, and 0 otherwise. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestrc +FORCE_INLINE int _mm_cmpestrc(__m128i a, + int la, + __m128i b, + int lb, + const int imm8) +{ + SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPESTRX); + return r2 != 0; +} + +// Compare packed strings in a and b with lengths la and lb using the control +// in imm8, and store the generated index in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestri +FORCE_INLINE int _mm_cmpestri(__m128i a, + int la, + __m128i b, + int lb, + const int imm8) +{ + SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPESTRX); + SSE2NEON_CMPSTR_GENERATE_INDEX(r2, bound, imm8); +} + +// Compare packed strings in a and b with lengths la and lb using the control +// in imm8, and store the generated mask in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestrm +FORCE_INLINE __m128i +_mm_cmpestrm(__m128i a, int la, __m128i b, int lb, const int imm8) +{ + SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPESTRX); + SSE2NEON_CMPSTR_GENERATE_MASK(dst); +} + +// Compare packed strings in a and b with lengths la and lb using the control in +// imm8, and returns bit 0 of the resulting bit mask. 
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestro +FORCE_INLINE int _mm_cmpestro(__m128i a, + int la, + __m128i b, + int lb, + const int imm8) +{ + SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPESTRX); + return r2 & 1; +} + +// Compare packed strings in a and b with lengths la and lb using the control in +// imm8, and returns 1 if any character in a was null, and 0 otherwise. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestrs +FORCE_INLINE int _mm_cmpestrs(__m128i a, + int la, + __m128i b, + int lb, + const int imm8) +{ + (void) a; + (void) b; + (void) lb; + SSE2NEON_CMPSTR_SET_UPPER(bound, imm8); + return la <= (bound - 1); +} + +// Compare packed strings in a and b with lengths la and lb using the control in +// imm8, and returns 1 if any character in b was null, and 0 otherwise. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestrz +FORCE_INLINE int _mm_cmpestrz(__m128i a, + int la, + __m128i b, + int lb, + const int imm8) +{ + (void) a; + (void) b; + (void) la; + SSE2NEON_CMPSTR_SET_UPPER(bound, imm8); + return lb <= (bound - 1); +} + +#define SSE2NEON_CMPISTRX_LENGTH(str, len, imm8) \ + do { \ + if (imm8 & 0x01) { \ + uint16x8_t equal_mask_##str = \ + vceqq_u16(vreinterpretq_u16_m128i(str), vdupq_n_u16(0)); \ + uint8x8_t res_##str = vshrn_n_u16(equal_mask_##str, 4); \ + uint64_t matches_##str = \ + vget_lane_u64(vreinterpret_u64_u8(res_##str), 0); \ + len = _sse2neon_ctzll(matches_##str) >> 3; \ + } else { \ + uint16x8_t equal_mask_##str = vreinterpretq_u16_u8( \ + vceqq_u8(vreinterpretq_u8_m128i(str), vdupq_n_u8(0))); \ + uint8x8_t res_##str = vshrn_n_u16(equal_mask_##str, 4); \ + uint64_t matches_##str = \ + vget_lane_u64(vreinterpret_u64_u8(res_##str), 0); \ + len = _sse2neon_ctzll(matches_##str) >> 2; \ + } \ + } while (0) + +#define SSE2NEON_CMPISTRX_LEN_PAIR(a, b, la, lb) \ + int la, lb; \ + do { \ + SSE2NEON_CMPISTRX_LENGTH(a, la, imm8); \ + SSE2NEON_CMPISTRX_LENGTH(b, lb, imm8); \ + } while (0) + +// Compare packed strings with implicit lengths in a and b using the control in +// imm8, and returns 1 if b did not contain a null character and the resulting +// mask was zero, and 0 otherwise. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistra +FORCE_INLINE int _mm_cmpistra(__m128i a, __m128i b, const int imm8) +{ + SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPISTRX); + return !r2 & (lb >= bound); +} + +// Compare packed strings with implicit lengths in a and b using the control in +// imm8, and returns 1 if the resulting mask was non-zero, and 0 otherwise. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistrc +FORCE_INLINE int _mm_cmpistrc(__m128i a, __m128i b, const int imm8) +{ + SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPISTRX); + return r2 != 0; +} + +// Compare packed strings with implicit lengths in a and b using the control in +// imm8, and store the generated index in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistri +FORCE_INLINE int _mm_cmpistri(__m128i a, __m128i b, const int imm8) +{ + SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPISTRX); + SSE2NEON_CMPSTR_GENERATE_INDEX(r2, bound, imm8); +} + +// Compare packed strings with implicit lengths in a and b using the control in +// imm8, and store the generated mask in dst. 
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistrm +FORCE_INLINE __m128i _mm_cmpistrm(__m128i a, __m128i b, const int imm8) +{ + SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPISTRX); + SSE2NEON_CMPSTR_GENERATE_MASK(dst); +} + +// Compare packed strings with implicit lengths in a and b using the control in +// imm8, and returns bit 0 of the resulting bit mask. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistro +FORCE_INLINE int _mm_cmpistro(__m128i a, __m128i b, const int imm8) +{ + SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPISTRX); + return r2 & 1; +} + +// Compare packed strings with implicit lengths in a and b using the control in +// imm8, and returns 1 if any character in a was null, and 0 otherwise. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistrs +FORCE_INLINE int _mm_cmpistrs(__m128i a, __m128i b, const int imm8) +{ + (void) b; + SSE2NEON_CMPSTR_SET_UPPER(bound, imm8); + int la; + SSE2NEON_CMPISTRX_LENGTH(a, la, imm8); + return la <= (bound - 1); +} + +// Compare packed strings with implicit lengths in a and b using the control in +// imm8, and returns 1 if any character in b was null, and 0 otherwise. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistrz +FORCE_INLINE int _mm_cmpistrz(__m128i a, __m128i b, const int imm8) +{ + (void) a; + SSE2NEON_CMPSTR_SET_UPPER(bound, imm8); + int lb; + SSE2NEON_CMPISTRX_LENGTH(b, lb, imm8); + return lb <= (bound - 1); +} + +// Compares the 2 signed 64-bit integers in a and the 2 signed 64-bit integers +// in b for greater than. +FORCE_INLINE __m128i _mm_cmpgt_epi64(__m128i a, __m128i b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128i_u64( + vcgtq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b))); +#else + return vreinterpretq_m128i_s64(vshrq_n_s64( + vqsubq_s64(vreinterpretq_s64_m128i(b), vreinterpretq_s64_m128i(a)), + 63)); +#endif +} + +// Starting with the initial value in crc, accumulates a CRC32 value for +// unsigned 16-bit integer v, and stores the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_crc32_u16 +FORCE_INLINE uint32_t _mm_crc32_u16(uint32_t crc, uint16_t v) +{ +#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32) + __asm__ __volatile__("crc32ch %w[c], %w[c], %w[v]\n\t" + : [c] "+r"(crc) + : [v] "r"(v)); +#elif ((__ARM_ARCH == 8) && defined(__ARM_FEATURE_CRC32)) || \ + (defined(_M_ARM64) && !defined(__clang__)) + crc = __crc32ch(crc, v); +#else + crc = _mm_crc32_u8(crc, v & 0xff); + crc = _mm_crc32_u8(crc, (v >> 8) & 0xff); +#endif + return crc; +} + +// Starting with the initial value in crc, accumulates a CRC32 value for +// unsigned 32-bit integer v, and stores the result in dst. 
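+// Illustrative only (not part of the upstream sse2neon sources): a caller
+// usually accumulates CRC32C over a buffer with the conventional initial
+// value and final inversion, e.g.:
+//
+//   uint32_t crc = 0xFFFFFFFF;
+//   for (size_t i = 0; i < len; i++)
+//       crc = _mm_crc32_u8(crc, buf[i]);
+//   crc = ~crc;
+//
+// `buf` and `len` are assumed to be provided by the caller.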
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_crc32_u32
+FORCE_INLINE uint32_t _mm_crc32_u32(uint32_t crc, uint32_t v)
+{
+#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
+ __asm__ __volatile__("crc32cw %w[c], %w[c], %w[v]\n\t"
+ : [c] "+r"(crc)
+ : [v] "r"(v));
+#elif ((__ARM_ARCH == 8) && defined(__ARM_FEATURE_CRC32)) || \
+ (defined(_M_ARM64) && !defined(__clang__))
+ crc = __crc32cw(crc, v);
+#else
+ crc = _mm_crc32_u16(crc, v & 0xffff);
+ crc = _mm_crc32_u16(crc, (v >> 16) & 0xffff);
+#endif
+ return crc;
+}
+
+// Starting with the initial value in crc, accumulates a CRC32 value for
+// unsigned 64-bit integer v, and stores the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_crc32_u64
+FORCE_INLINE uint64_t _mm_crc32_u64(uint64_t crc, uint64_t v)
+{
+#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
+ __asm__ __volatile__("crc32cx %w[c], %w[c], %x[v]\n\t"
+ : [c] "+r"(crc)
+ : [v] "r"(v));
+#elif (defined(_M_ARM64) && !defined(__clang__))
+ crc = __crc32cd((uint32_t) crc, v);
+#else
+ crc = _mm_crc32_u32((uint32_t) (crc), v & 0xffffffff);
+ crc = _mm_crc32_u32((uint32_t) (crc), (v >> 32) & 0xffffffff);
+#endif
+ return crc;
+}
+
+// Starting with the initial value in crc, accumulates a CRC32 value for
+// unsigned 8-bit integer v, and stores the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_crc32_u8
+FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t crc, uint8_t v)
+{
+#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
+ __asm__ __volatile__("crc32cb %w[c], %w[c], %w[v]\n\t"
+ : [c] "+r"(crc)
+ : [v] "r"(v));
+#elif ((__ARM_ARCH == 8) && defined(__ARM_FEATURE_CRC32)) || \
+ (defined(_M_ARM64) && !defined(__clang__))
+ crc = __crc32cb(crc, v);
+#else
+ crc ^= v;
+#if defined(__ARM_FEATURE_CRYPTO)
+ // Adapted from: https://mary.rs/lab/crc32/
+ // Barrett reduction
+ uint64x2_t orig =
+ vcombine_u64(vcreate_u64((uint64_t) (crc) << 24), vcreate_u64(0x0));
+ uint64x2_t tmp = orig;
+
+ // Polynomial P(x) of CRC32C
+ uint64_t p = 0x105EC76F1;
+ // Barrett Reduction (in bit-reflected form) constant mu_{64} = \lfloor
+ // 2^{64} / P(x) \rfloor = 0x11f91caf6
+ uint64_t mu = 0x1dea713f1;
+
+ // Multiply by mu_{64}
+ tmp = _sse2neon_vmull_p64(vget_low_u64(tmp), vcreate_u64(mu));
+ // Divide by 2^{64} (mask away the unnecessary bits)
+ tmp =
+ vandq_u64(tmp, vcombine_u64(vcreate_u64(0xFFFFFFFF), vcreate_u64(0x0)));
+ // Multiply by P(x) (shifted left by 1 for alignment reasons)
+ tmp = _sse2neon_vmull_p64(vget_low_u64(tmp), vcreate_u64(p));
+ // Subtract original from result
+ tmp = veorq_u64(tmp, orig);
+
+ // Extract the 'lower' (in bit-reflected sense) 32 bits
+ crc = vgetq_lane_u32(vreinterpretq_u32_u64(tmp), 1);
+#else // Fall back to the generic table lookup approach
+ // Adapted from: https://create.stephan-brumme.com/crc32/
+ // Apply half-byte comparison algorithm for the best ratio between
+ // performance and lookup table.
+
+ // The lookup table just needs to store every 16th entry
+ // of the standard look-up table.
+ static const uint32_t crc32_half_byte_tbl[] = { + 0x00000000, 0x105ec76f, 0x20bd8ede, 0x30e349b1, 0x417b1dbc, 0x5125dad3, + 0x61c69362, 0x7198540d, 0x82f63b78, 0x92a8fc17, 0xa24bb5a6, 0xb21572c9, + 0xc38d26c4, 0xd3d3e1ab, 0xe330a81a, 0xf36e6f75, + }; + + crc = (crc >> 4) ^ crc32_half_byte_tbl[crc & 0x0F]; + crc = (crc >> 4) ^ crc32_half_byte_tbl[crc & 0x0F]; +#endif +#endif + return crc; +} + +/* AES */ + +#if !defined(__ARM_FEATURE_CRYPTO) && (!defined(_M_ARM64) || defined(__clang__)) +/* clang-format off */ +#define SSE2NEON_AES_SBOX(w) \ + { \ + w(0x63), w(0x7c), w(0x77), w(0x7b), w(0xf2), w(0x6b), w(0x6f), \ + w(0xc5), w(0x30), w(0x01), w(0x67), w(0x2b), w(0xfe), w(0xd7), \ + w(0xab), w(0x76), w(0xca), w(0x82), w(0xc9), w(0x7d), w(0xfa), \ + w(0x59), w(0x47), w(0xf0), w(0xad), w(0xd4), w(0xa2), w(0xaf), \ + w(0x9c), w(0xa4), w(0x72), w(0xc0), w(0xb7), w(0xfd), w(0x93), \ + w(0x26), w(0x36), w(0x3f), w(0xf7), w(0xcc), w(0x34), w(0xa5), \ + w(0xe5), w(0xf1), w(0x71), w(0xd8), w(0x31), w(0x15), w(0x04), \ + w(0xc7), w(0x23), w(0xc3), w(0x18), w(0x96), w(0x05), w(0x9a), \ + w(0x07), w(0x12), w(0x80), w(0xe2), w(0xeb), w(0x27), w(0xb2), \ + w(0x75), w(0x09), w(0x83), w(0x2c), w(0x1a), w(0x1b), w(0x6e), \ + w(0x5a), w(0xa0), w(0x52), w(0x3b), w(0xd6), w(0xb3), w(0x29), \ + w(0xe3), w(0x2f), w(0x84), w(0x53), w(0xd1), w(0x00), w(0xed), \ + w(0x20), w(0xfc), w(0xb1), w(0x5b), w(0x6a), w(0xcb), w(0xbe), \ + w(0x39), w(0x4a), w(0x4c), w(0x58), w(0xcf), w(0xd0), w(0xef), \ + w(0xaa), w(0xfb), w(0x43), w(0x4d), w(0x33), w(0x85), w(0x45), \ + w(0xf9), w(0x02), w(0x7f), w(0x50), w(0x3c), w(0x9f), w(0xa8), \ + w(0x51), w(0xa3), w(0x40), w(0x8f), w(0x92), w(0x9d), w(0x38), \ + w(0xf5), w(0xbc), w(0xb6), w(0xda), w(0x21), w(0x10), w(0xff), \ + w(0xf3), w(0xd2), w(0xcd), w(0x0c), w(0x13), w(0xec), w(0x5f), \ + w(0x97), w(0x44), w(0x17), w(0xc4), w(0xa7), w(0x7e), w(0x3d), \ + w(0x64), w(0x5d), w(0x19), w(0x73), w(0x60), w(0x81), w(0x4f), \ + w(0xdc), w(0x22), w(0x2a), w(0x90), w(0x88), w(0x46), w(0xee), \ + w(0xb8), w(0x14), w(0xde), w(0x5e), w(0x0b), w(0xdb), w(0xe0), \ + w(0x32), w(0x3a), w(0x0a), w(0x49), w(0x06), w(0x24), w(0x5c), \ + w(0xc2), w(0xd3), w(0xac), w(0x62), w(0x91), w(0x95), w(0xe4), \ + w(0x79), w(0xe7), w(0xc8), w(0x37), w(0x6d), w(0x8d), w(0xd5), \ + w(0x4e), w(0xa9), w(0x6c), w(0x56), w(0xf4), w(0xea), w(0x65), \ + w(0x7a), w(0xae), w(0x08), w(0xba), w(0x78), w(0x25), w(0x2e), \ + w(0x1c), w(0xa6), w(0xb4), w(0xc6), w(0xe8), w(0xdd), w(0x74), \ + w(0x1f), w(0x4b), w(0xbd), w(0x8b), w(0x8a), w(0x70), w(0x3e), \ + w(0xb5), w(0x66), w(0x48), w(0x03), w(0xf6), w(0x0e), w(0x61), \ + w(0x35), w(0x57), w(0xb9), w(0x86), w(0xc1), w(0x1d), w(0x9e), \ + w(0xe1), w(0xf8), w(0x98), w(0x11), w(0x69), w(0xd9), w(0x8e), \ + w(0x94), w(0x9b), w(0x1e), w(0x87), w(0xe9), w(0xce), w(0x55), \ + w(0x28), w(0xdf), w(0x8c), w(0xa1), w(0x89), w(0x0d), w(0xbf), \ + w(0xe6), w(0x42), w(0x68), w(0x41), w(0x99), w(0x2d), w(0x0f), \ + w(0xb0), w(0x54), w(0xbb), w(0x16) \ + } +#define SSE2NEON_AES_RSBOX(w) \ + { \ + w(0x52), w(0x09), w(0x6a), w(0xd5), w(0x30), w(0x36), w(0xa5), \ + w(0x38), w(0xbf), w(0x40), w(0xa3), w(0x9e), w(0x81), w(0xf3), \ + w(0xd7), w(0xfb), w(0x7c), w(0xe3), w(0x39), w(0x82), w(0x9b), \ + w(0x2f), w(0xff), w(0x87), w(0x34), w(0x8e), w(0x43), w(0x44), \ + w(0xc4), w(0xde), w(0xe9), w(0xcb), w(0x54), w(0x7b), w(0x94), \ + w(0x32), w(0xa6), w(0xc2), w(0x23), w(0x3d), w(0xee), w(0x4c), \ + w(0x95), w(0x0b), w(0x42), w(0xfa), w(0xc3), w(0x4e), w(0x08), \ + w(0x2e), w(0xa1), w(0x66), 
w(0x28), w(0xd9), w(0x24), w(0xb2), \ + w(0x76), w(0x5b), w(0xa2), w(0x49), w(0x6d), w(0x8b), w(0xd1), \ + w(0x25), w(0x72), w(0xf8), w(0xf6), w(0x64), w(0x86), w(0x68), \ + w(0x98), w(0x16), w(0xd4), w(0xa4), w(0x5c), w(0xcc), w(0x5d), \ + w(0x65), w(0xb6), w(0x92), w(0x6c), w(0x70), w(0x48), w(0x50), \ + w(0xfd), w(0xed), w(0xb9), w(0xda), w(0x5e), w(0x15), w(0x46), \ + w(0x57), w(0xa7), w(0x8d), w(0x9d), w(0x84), w(0x90), w(0xd8), \ + w(0xab), w(0x00), w(0x8c), w(0xbc), w(0xd3), w(0x0a), w(0xf7), \ + w(0xe4), w(0x58), w(0x05), w(0xb8), w(0xb3), w(0x45), w(0x06), \ + w(0xd0), w(0x2c), w(0x1e), w(0x8f), w(0xca), w(0x3f), w(0x0f), \ + w(0x02), w(0xc1), w(0xaf), w(0xbd), w(0x03), w(0x01), w(0x13), \ + w(0x8a), w(0x6b), w(0x3a), w(0x91), w(0x11), w(0x41), w(0x4f), \ + w(0x67), w(0xdc), w(0xea), w(0x97), w(0xf2), w(0xcf), w(0xce), \ + w(0xf0), w(0xb4), w(0xe6), w(0x73), w(0x96), w(0xac), w(0x74), \ + w(0x22), w(0xe7), w(0xad), w(0x35), w(0x85), w(0xe2), w(0xf9), \ + w(0x37), w(0xe8), w(0x1c), w(0x75), w(0xdf), w(0x6e), w(0x47), \ + w(0xf1), w(0x1a), w(0x71), w(0x1d), w(0x29), w(0xc5), w(0x89), \ + w(0x6f), w(0xb7), w(0x62), w(0x0e), w(0xaa), w(0x18), w(0xbe), \ + w(0x1b), w(0xfc), w(0x56), w(0x3e), w(0x4b), w(0xc6), w(0xd2), \ + w(0x79), w(0x20), w(0x9a), w(0xdb), w(0xc0), w(0xfe), w(0x78), \ + w(0xcd), w(0x5a), w(0xf4), w(0x1f), w(0xdd), w(0xa8), w(0x33), \ + w(0x88), w(0x07), w(0xc7), w(0x31), w(0xb1), w(0x12), w(0x10), \ + w(0x59), w(0x27), w(0x80), w(0xec), w(0x5f), w(0x60), w(0x51), \ + w(0x7f), w(0xa9), w(0x19), w(0xb5), w(0x4a), w(0x0d), w(0x2d), \ + w(0xe5), w(0x7a), w(0x9f), w(0x93), w(0xc9), w(0x9c), w(0xef), \ + w(0xa0), w(0xe0), w(0x3b), w(0x4d), w(0xae), w(0x2a), w(0xf5), \ + w(0xb0), w(0xc8), w(0xeb), w(0xbb), w(0x3c), w(0x83), w(0x53), \ + w(0x99), w(0x61), w(0x17), w(0x2b), w(0x04), w(0x7e), w(0xba), \ + w(0x77), w(0xd6), w(0x26), w(0xe1), w(0x69), w(0x14), w(0x63), \ + w(0x55), w(0x21), w(0x0c), w(0x7d) \ + } +/* clang-format on */ + +/* X Macro trick. See https://en.wikipedia.org/wiki/X_Macro */ +#define SSE2NEON_AES_H0(x) (x) +static const uint8_t _sse2neon_sbox[256] = SSE2NEON_AES_SBOX(SSE2NEON_AES_H0); +static const uint8_t _sse2neon_rsbox[256] = SSE2NEON_AES_RSBOX(SSE2NEON_AES_H0); +#undef SSE2NEON_AES_H0 + +/* x_time function and matrix multiply function */ +#if !defined(__aarch64__) && !defined(_M_ARM64) +#define SSE2NEON_XT(x) (((x) << 1) ^ ((((x) >> 7) & 1) * 0x1b)) +#define SSE2NEON_MULTIPLY(x, y) \ + (((y & 1) * x) ^ ((y >> 1 & 1) * SSE2NEON_XT(x)) ^ \ + ((y >> 2 & 1) * SSE2NEON_XT(SSE2NEON_XT(x))) ^ \ + ((y >> 3 & 1) * SSE2NEON_XT(SSE2NEON_XT(SSE2NEON_XT(x)))) ^ \ + ((y >> 4 & 1) * SSE2NEON_XT(SSE2NEON_XT(SSE2NEON_XT(SSE2NEON_XT(x)))))) +#endif + +// In the absence of crypto extensions, implement aesenc using regular NEON +// intrinsics instead. See: +// https://www.workofard.com/2017/01/accelerated-aes-for-the-arm64-linux-kernel/ +// https://www.workofard.com/2017/07/ghash-for-low-end-cores/ and +// for more information. 
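+// Illustrative only (not part of the upstream sse2neon sources): given an
+// already expanded AES-128 key schedule `rk[0..10]`, a caller encrypts one
+// 16-byte block roughly as follows:
+//
+//   __m128i state = _mm_xor_si128(block, rk[0]);     // initial AddRoundKey
+//   for (int r = 1; r < 10; r++)
+//       state = _mm_aesenc_si128(state, rk[r]);      // 9 full rounds
+//   state = _mm_aesenclast_si128(state, rk[10]);     // last round, no MixColumns
+//
+// `block` and `rk` are assumed to come from the caller; the key schedule can
+// be derived with _mm_aeskeygenassist_si128 further below.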
+FORCE_INLINE __m128i _mm_aesenc_si128(__m128i a, __m128i RoundKey) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + static const uint8_t shift_rows[] = { + 0x0, 0x5, 0xa, 0xf, 0x4, 0x9, 0xe, 0x3, + 0x8, 0xd, 0x2, 0x7, 0xc, 0x1, 0x6, 0xb, + }; + static const uint8_t ror32by8[] = { + 0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4, + 0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc, + }; + + uint8x16_t v; + uint8x16_t w = vreinterpretq_u8_m128i(a); + + /* shift rows */ + w = vqtbl1q_u8(w, vld1q_u8(shift_rows)); + + /* sub bytes */ + // Here, we separate the whole 256-bytes table into 4 64-bytes tables, and + // look up each of the table. After each lookup, we load the next table + // which locates at the next 64-bytes. In the meantime, the index in the + // table would be smaller than it was, so the index parameters of + // `vqtbx4q_u8()` need to be added the same constant as the loaded tables. + v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_sbox), w); + // 'w-0x40' equals to 'vsubq_u8(w, vdupq_n_u8(0x40))' + v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x40), w - 0x40); + v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x80), w - 0x80); + v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0xc0), w - 0xc0); + + /* mix columns */ + w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & 0x1b); + w ^= (uint8x16_t) vrev32q_u16((uint16x8_t) v); + w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8)); + + /* add round key */ + return vreinterpretq_m128i_u8(w) ^ RoundKey; + +#else /* ARMv7-A implementation for a table-based AES */ +#define SSE2NEON_AES_B2W(b0, b1, b2, b3) \ + (((uint32_t) (b3) << 24) | ((uint32_t) (b2) << 16) | \ + ((uint32_t) (b1) << 8) | (uint32_t) (b0)) +// multiplying 'x' by 2 in GF(2^8) +#define SSE2NEON_AES_F2(x) ((x << 1) ^ (((x >> 7) & 1) * 0x011b /* WPOLY */)) +// multiplying 'x' by 3 in GF(2^8) +#define SSE2NEON_AES_F3(x) (SSE2NEON_AES_F2(x) ^ x) +#define SSE2NEON_AES_U0(p) \ + SSE2NEON_AES_B2W(SSE2NEON_AES_F2(p), p, p, SSE2NEON_AES_F3(p)) +#define SSE2NEON_AES_U1(p) \ + SSE2NEON_AES_B2W(SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p), p, p) +#define SSE2NEON_AES_U2(p) \ + SSE2NEON_AES_B2W(p, SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p), p) +#define SSE2NEON_AES_U3(p) \ + SSE2NEON_AES_B2W(p, p, SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p)) + + // this generates a table containing every possible permutation of + // shift_rows() and sub_bytes() with mix_columns(). 
+ static const uint32_t ALIGN_STRUCT(16) aes_table[4][256] = { + SSE2NEON_AES_SBOX(SSE2NEON_AES_U0), + SSE2NEON_AES_SBOX(SSE2NEON_AES_U1), + SSE2NEON_AES_SBOX(SSE2NEON_AES_U2), + SSE2NEON_AES_SBOX(SSE2NEON_AES_U3), + }; +#undef SSE2NEON_AES_B2W +#undef SSE2NEON_AES_F2 +#undef SSE2NEON_AES_F3 +#undef SSE2NEON_AES_U0 +#undef SSE2NEON_AES_U1 +#undef SSE2NEON_AES_U2 +#undef SSE2NEON_AES_U3 + + uint32_t x0 = _mm_cvtsi128_si32(a); // get a[31:0] + uint32_t x1 = + _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0x55)); // get a[63:32] + uint32_t x2 = + _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0xAA)); // get a[95:64] + uint32_t x3 = + _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0xFF)); // get a[127:96] + + // finish the modulo addition step in mix_columns() + __m128i out = _mm_set_epi32( + (aes_table[0][x3 & 0xff] ^ aes_table[1][(x0 >> 8) & 0xff] ^ + aes_table[2][(x1 >> 16) & 0xff] ^ aes_table[3][x2 >> 24]), + (aes_table[0][x2 & 0xff] ^ aes_table[1][(x3 >> 8) & 0xff] ^ + aes_table[2][(x0 >> 16) & 0xff] ^ aes_table[3][x1 >> 24]), + (aes_table[0][x1 & 0xff] ^ aes_table[1][(x2 >> 8) & 0xff] ^ + aes_table[2][(x3 >> 16) & 0xff] ^ aes_table[3][x0 >> 24]), + (aes_table[0][x0 & 0xff] ^ aes_table[1][(x1 >> 8) & 0xff] ^ + aes_table[2][(x2 >> 16) & 0xff] ^ aes_table[3][x3 >> 24])); + + return _mm_xor_si128(out, RoundKey); +#endif +} + +// Perform one round of an AES decryption flow on data (state) in a using the +// round key in RoundKey, and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdec_si128 +FORCE_INLINE __m128i _mm_aesdec_si128(__m128i a, __m128i RoundKey) +{ +#if defined(__aarch64__) + static const uint8_t inv_shift_rows[] = { + 0x0, 0xd, 0xa, 0x7, 0x4, 0x1, 0xe, 0xb, + 0x8, 0x5, 0x2, 0xf, 0xc, 0x9, 0x6, 0x3, + }; + static const uint8_t ror32by8[] = { + 0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4, + 0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc, + }; + + uint8x16_t v; + uint8x16_t w = vreinterpretq_u8_m128i(a); + + // inverse shift rows + w = vqtbl1q_u8(w, vld1q_u8(inv_shift_rows)); + + // inverse sub bytes + v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_rsbox), w); + v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0x40), w - 0x40); + v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0x80), w - 0x80); + v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0xc0), w - 0xc0); + + // inverse mix columns + // multiplying 'v' by 4 in GF(2^8) + w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & 0x1b); + w = (w << 1) ^ (uint8x16_t) (((int8x16_t) w >> 7) & 0x1b); + v ^= w; + v ^= (uint8x16_t) vrev32q_u16((uint16x8_t) w); + + w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & + 0x1b); // multiplying 'v' by 2 in GF(2^8) + w ^= (uint8x16_t) vrev32q_u16((uint16x8_t) v); + w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8)); + + // add round key + return vreinterpretq_m128i_u8(w) ^ RoundKey; + +#else /* ARMv7-A NEON implementation */ + /* FIXME: optimized for NEON */ + uint8_t i, e, f, g, h, v[4][4]; + uint8_t *_a = (uint8_t *) &a; + for (i = 0; i < 16; ++i) { + v[((i / 4) + (i % 4)) % 4][i % 4] = _sse2neon_rsbox[_a[i]]; + } + + // inverse mix columns + for (i = 0; i < 4; ++i) { + e = v[i][0]; + f = v[i][1]; + g = v[i][2]; + h = v[i][3]; + + v[i][0] = SSE2NEON_MULTIPLY(e, 0x0e) ^ SSE2NEON_MULTIPLY(f, 0x0b) ^ + SSE2NEON_MULTIPLY(g, 0x0d) ^ SSE2NEON_MULTIPLY(h, 0x09); + v[i][1] = SSE2NEON_MULTIPLY(e, 0x09) ^ SSE2NEON_MULTIPLY(f, 0x0e) ^ + SSE2NEON_MULTIPLY(g, 0x0b) ^ SSE2NEON_MULTIPLY(h, 0x0d); + v[i][2] = SSE2NEON_MULTIPLY(e, 0x0d) ^ SSE2NEON_MULTIPLY(f, 
0x09) ^ + SSE2NEON_MULTIPLY(g, 0x0e) ^ SSE2NEON_MULTIPLY(h, 0x0b); + v[i][3] = SSE2NEON_MULTIPLY(e, 0x0b) ^ SSE2NEON_MULTIPLY(f, 0x0d) ^ + SSE2NEON_MULTIPLY(g, 0x09) ^ SSE2NEON_MULTIPLY(h, 0x0e); + } + + return vreinterpretq_m128i_u8(vld1q_u8((uint8_t *) v)) ^ RoundKey; +#endif +} + +// Perform the last round of an AES encryption flow on data (state) in a using +// the round key in RoundKey, and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesenclast_si128 +FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey) +{ +#if defined(__aarch64__) + static const uint8_t shift_rows[] = { + 0x0, 0x5, 0xa, 0xf, 0x4, 0x9, 0xe, 0x3, + 0x8, 0xd, 0x2, 0x7, 0xc, 0x1, 0x6, 0xb, + }; + + uint8x16_t v; + uint8x16_t w = vreinterpretq_u8_m128i(a); + + // shift rows + w = vqtbl1q_u8(w, vld1q_u8(shift_rows)); + + // sub bytes + v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_sbox), w); + v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x40), w - 0x40); + v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x80), w - 0x80); + v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0xc0), w - 0xc0); + + // add round key + return vreinterpretq_m128i_u8(v) ^ RoundKey; + +#else /* ARMv7-A implementation */ + uint8_t v[16] = { + _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 0)], + _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 5)], + _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 10)], + _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 15)], + _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 4)], + _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 9)], + _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 14)], + _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 3)], + _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 8)], + _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 13)], + _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 2)], + _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 7)], + _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 12)], + _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 1)], + _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 6)], + _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 11)], + }; + + return vreinterpretq_m128i_u8(vld1q_u8(v)) ^ RoundKey; +#endif +} + +// Perform the last round of an AES decryption flow on data (state) in a using +// the round key in RoundKey, and store the result in dst. 
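+// Unlike _mm_aesdec_si128, the last round omits the InvMixColumns step, so
+// only InvShiftRows, InvSubBytes and AddRoundKey are applied.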
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdeclast_si128 +FORCE_INLINE __m128i _mm_aesdeclast_si128(__m128i a, __m128i RoundKey) +{ +#if defined(__aarch64__) + static const uint8_t inv_shift_rows[] = { + 0x0, 0xd, 0xa, 0x7, 0x4, 0x1, 0xe, 0xb, + 0x8, 0x5, 0x2, 0xf, 0xc, 0x9, 0x6, 0x3, + }; + + uint8x16_t v; + uint8x16_t w = vreinterpretq_u8_m128i(a); + + // inverse shift rows + w = vqtbl1q_u8(w, vld1q_u8(inv_shift_rows)); + + // inverse sub bytes + v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_rsbox), w); + v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0x40), w - 0x40); + v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0x80), w - 0x80); + v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0xc0), w - 0xc0); + + // add round key + return vreinterpretq_m128i_u8(v) ^ RoundKey; + +#else /* ARMv7-A NEON implementation */ + /* FIXME: optimized for NEON */ + uint8_t v[4][4]; + uint8_t *_a = (uint8_t *) &a; + for (int i = 0; i < 16; ++i) { + v[((i / 4) + (i % 4)) % 4][i % 4] = _sse2neon_rsbox[_a[i]]; + } + + return vreinterpretq_m128i_u8(vld1q_u8((uint8_t *) v)) ^ RoundKey; +#endif +} + +// Perform the InvMixColumns transformation on a and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesimc_si128 +FORCE_INLINE __m128i _mm_aesimc_si128(__m128i a) +{ +#if defined(__aarch64__) + static const uint8_t ror32by8[] = { + 0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4, + 0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc, + }; + uint8x16_t v = vreinterpretq_u8_m128i(a); + uint8x16_t w; + + // multiplying 'v' by 4 in GF(2^8) + w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & 0x1b); + w = (w << 1) ^ (uint8x16_t) (((int8x16_t) w >> 7) & 0x1b); + v ^= w; + v ^= (uint8x16_t) vrev32q_u16((uint16x8_t) w); + + // multiplying 'v' by 2 in GF(2^8) + w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & 0x1b); + w ^= (uint8x16_t) vrev32q_u16((uint16x8_t) v); + w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8)); + return vreinterpretq_m128i_u8(w); + +#else /* ARMv7-A NEON implementation */ + uint8_t i, e, f, g, h, v[4][4]; + vst1q_u8((uint8_t *) v, vreinterpretq_u8_m128i(a)); + for (i = 0; i < 4; ++i) { + e = v[i][0]; + f = v[i][1]; + g = v[i][2]; + h = v[i][3]; + + v[i][0] = SSE2NEON_MULTIPLY(e, 0x0e) ^ SSE2NEON_MULTIPLY(f, 0x0b) ^ + SSE2NEON_MULTIPLY(g, 0x0d) ^ SSE2NEON_MULTIPLY(h, 0x09); + v[i][1] = SSE2NEON_MULTIPLY(e, 0x09) ^ SSE2NEON_MULTIPLY(f, 0x0e) ^ + SSE2NEON_MULTIPLY(g, 0x0b) ^ SSE2NEON_MULTIPLY(h, 0x0d); + v[i][2] = SSE2NEON_MULTIPLY(e, 0x0d) ^ SSE2NEON_MULTIPLY(f, 0x09) ^ + SSE2NEON_MULTIPLY(g, 0x0e) ^ SSE2NEON_MULTIPLY(h, 0x0b); + v[i][3] = SSE2NEON_MULTIPLY(e, 0x0b) ^ SSE2NEON_MULTIPLY(f, 0x0d) ^ + SSE2NEON_MULTIPLY(g, 0x09) ^ SSE2NEON_MULTIPLY(h, 0x0e); + } + + return vreinterpretq_m128i_u8(vld1q_u8((uint8_t *) v)); +#endif +} + +// Assist in expanding the AES cipher key by computing steps towards generating +// a round key for encryption cipher using data from a and an 8-bit round +// constant specified in imm8, and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aeskeygenassist_si128 +// +// Emits the Advanced Encryption Standard (AES) instruction aeskeygenassist. +// This instruction generates a round key for AES encryption. See +// https://kazakov.life/2017/11/01/cryptocurrency-mining-on-ios-devices/ +// for details. 
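+// Lanes 0..3 of the result are { SubWord(X1), RotWord(SubWord(X1)) ^ rcon,
+// SubWord(X3), RotWord(SubWord(X3)) ^ rcon }, where X1 = a[63:32] and
+// X3 = a[127:96].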
+FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon) +{ +#if defined(__aarch64__) + uint8x16_t _a = vreinterpretq_u8_m128i(a); + uint8x16_t v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_sbox), _a); + v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x40), _a - 0x40); + v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x80), _a - 0x80); + v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0xc0), _a - 0xc0); + + uint32x4_t v_u32 = vreinterpretq_u32_u8(v); + uint32x4_t ror_v = vorrq_u32(vshrq_n_u32(v_u32, 8), vshlq_n_u32(v_u32, 24)); + uint32x4_t ror_xor_v = veorq_u32(ror_v, vdupq_n_u32(rcon)); + + return vreinterpretq_m128i_u32(vtrn2q_u32(v_u32, ror_xor_v)); + +#else /* ARMv7-A NEON implementation */ + uint32_t X1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0x55)); + uint32_t X3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0xFF)); + for (int i = 0; i < 4; ++i) { + ((uint8_t *) &X1)[i] = _sse2neon_sbox[((uint8_t *) &X1)[i]]; + ((uint8_t *) &X3)[i] = _sse2neon_sbox[((uint8_t *) &X3)[i]]; + } + return _mm_set_epi32(((X3 >> 8) | (X3 << 24)) ^ rcon, X3, + ((X1 >> 8) | (X1 << 24)) ^ rcon, X1); +#endif +} +#undef SSE2NEON_AES_SBOX +#undef SSE2NEON_AES_RSBOX + +#if defined(__aarch64__) +#undef SSE2NEON_XT +#undef SSE2NEON_MULTIPLY +#endif + +#else /* __ARM_FEATURE_CRYPTO */ +// Implements equivalent of 'aesenc' by combining AESE (with an empty key) and +// AESMC and then manually applying the real key as an xor operation. This +// unfortunately means an additional xor op; the compiler should be able to +// optimize this away for repeated calls however. See +// https://blog.michaelbrase.com/2018/05/08/emulating-x86-aes-intrinsics-on-armv8-a +// for more details. +FORCE_INLINE __m128i _mm_aesenc_si128(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u8(veorq_u8( + vaesmcq_u8(vaeseq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0))), + vreinterpretq_u8_m128i(b))); +} + +// Perform one round of an AES decryption flow on data (state) in a using the +// round key in RoundKey, and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdec_si128 +FORCE_INLINE __m128i _mm_aesdec_si128(__m128i a, __m128i RoundKey) +{ + return vreinterpretq_m128i_u8(veorq_u8( + vaesimcq_u8(vaesdq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0))), + vreinterpretq_u8_m128i(RoundKey))); +} + +// Perform the last round of an AES encryption flow on data (state) in a using +// the round key in RoundKey, and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesenclast_si128 +FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey) +{ + return _mm_xor_si128(vreinterpretq_m128i_u8(vaeseq_u8( + vreinterpretq_u8_m128i(a), vdupq_n_u8(0))), + RoundKey); +} + +// Perform the last round of an AES decryption flow on data (state) in a using +// the round key in RoundKey, and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdeclast_si128 +FORCE_INLINE __m128i _mm_aesdeclast_si128(__m128i a, __m128i RoundKey) +{ + return vreinterpretq_m128i_u8( + veorq_u8(vaesdq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0)), + vreinterpretq_u8_m128i(RoundKey))); +} + +// Perform the InvMixColumns transformation on a and store the result in dst. 
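+// Typically applied to encryption round keys to derive the round keys of the
+// equivalent inverse cipher used for decryption.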
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesimc_si128 +FORCE_INLINE __m128i _mm_aesimc_si128(__m128i a) +{ + return vreinterpretq_m128i_u8(vaesimcq_u8(vreinterpretq_u8_m128i(a))); +} + +// Assist in expanding the AES cipher key by computing steps towards generating +// a round key for encryption cipher using data from a and an 8-bit round +// constant specified in imm8, and store the result in dst." +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aeskeygenassist_si128 +FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon) +{ + // AESE does ShiftRows and SubBytes on A + uint8x16_t u8 = vaeseq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0)); + +#if !defined(_MSC_VER) || defined(__clang__) + uint8x16_t dest = { + // Undo ShiftRows step from AESE and extract X1 and X3 + u8[0x4], u8[0x1], u8[0xE], u8[0xB], // SubBytes(X1) + u8[0x1], u8[0xE], u8[0xB], u8[0x4], // ROT(SubBytes(X1)) + u8[0xC], u8[0x9], u8[0x6], u8[0x3], // SubBytes(X3) + u8[0x9], u8[0x6], u8[0x3], u8[0xC], // ROT(SubBytes(X3)) + }; + uint32x4_t r = {0, (unsigned) rcon, 0, (unsigned) rcon}; + return vreinterpretq_m128i_u8(dest) ^ vreinterpretq_m128i_u32(r); +#else + // We have to do this hack because MSVC is strictly adhering to the CPP + // standard, in particular C++03 8.5.1 sub-section 15, which states that + // unions must be initialized by their first member type. + + // As per the Windows ARM64 ABI, it is always little endian, so this works + __n128 dest{ + ((uint64_t) u8.n128_u8[0x4] << 0) | ((uint64_t) u8.n128_u8[0x1] << 8) | + ((uint64_t) u8.n128_u8[0xE] << 16) | + ((uint64_t) u8.n128_u8[0xB] << 24) | + ((uint64_t) u8.n128_u8[0x1] << 32) | + ((uint64_t) u8.n128_u8[0xE] << 40) | + ((uint64_t) u8.n128_u8[0xB] << 48) | + ((uint64_t) u8.n128_u8[0x4] << 56), + ((uint64_t) u8.n128_u8[0xC] << 0) | ((uint64_t) u8.n128_u8[0x9] << 8) | + ((uint64_t) u8.n128_u8[0x6] << 16) | + ((uint64_t) u8.n128_u8[0x3] << 24) | + ((uint64_t) u8.n128_u8[0x9] << 32) | + ((uint64_t) u8.n128_u8[0x6] << 40) | + ((uint64_t) u8.n128_u8[0x3] << 48) | + ((uint64_t) u8.n128_u8[0xC] << 56)}; + + dest.n128_u32[1] = dest.n128_u32[1] ^ rcon; + dest.n128_u32[3] = dest.n128_u32[3] ^ rcon; + + return dest; +#endif +} +#endif + +/* Others */ + +// Perform a carry-less multiplication of two 64-bit integers, selected from a +// and b according to imm8, and store the results in dst. 
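+// Bit 0 of imm8 selects the low (0) or high (1) 64-bit half of a, and bit 4
+// selects the half of b; e.g. imm8 == 0x10 multiplies the low half of a by
+// the high half of b.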
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_clmulepi64_si128 +FORCE_INLINE __m128i _mm_clmulepi64_si128(__m128i _a, __m128i _b, const int imm) +{ + uint64x2_t a = vreinterpretq_u64_m128i(_a); + uint64x2_t b = vreinterpretq_u64_m128i(_b); + switch (imm & 0x11) { + case 0x00: + return vreinterpretq_m128i_u64( + _sse2neon_vmull_p64(vget_low_u64(a), vget_low_u64(b))); + case 0x01: + return vreinterpretq_m128i_u64( + _sse2neon_vmull_p64(vget_high_u64(a), vget_low_u64(b))); + case 0x10: + return vreinterpretq_m128i_u64( + _sse2neon_vmull_p64(vget_low_u64(a), vget_high_u64(b))); + case 0x11: + return vreinterpretq_m128i_u64( + _sse2neon_vmull_p64(vget_high_u64(a), vget_high_u64(b))); + default: + abort(); + } +} + +FORCE_INLINE unsigned int _sse2neon_mm_get_denormals_zero_mode(void) +{ + union { + fpcr_bitfield field; +#if defined(__aarch64__) || defined(_M_ARM64) + uint64_t value; +#else + uint32_t value; +#endif + } r; + +#if defined(__aarch64__) || defined(_M_ARM64) + r.value = _sse2neon_get_fpcr(); +#else + __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */ +#endif + + return r.field.bit24 ? _MM_DENORMALS_ZERO_ON : _MM_DENORMALS_ZERO_OFF; +} + +// Count the number of bits set to 1 in unsigned 32-bit integer a, and +// return that count in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_popcnt_u32 +FORCE_INLINE int _mm_popcnt_u32(unsigned int a) +{ +#if defined(__aarch64__) || defined(_M_ARM64) +#if __has_builtin(__builtin_popcount) + return __builtin_popcount(a); +#elif defined(_MSC_VER) + return _CountOneBits(a); +#else + return (int) vaddlv_u8(vcnt_u8(vcreate_u8((uint64_t) a))); +#endif +#else + uint32_t count = 0; + uint8x8_t input_val, count8x8_val; + uint16x4_t count16x4_val; + uint32x2_t count32x2_val; + + input_val = vld1_u8((uint8_t *) &a); + count8x8_val = vcnt_u8(input_val); + count16x4_val = vpaddl_u8(count8x8_val); + count32x2_val = vpaddl_u16(count16x4_val); + + vst1_u32(&count, count32x2_val); + return count; +#endif +} + +// Count the number of bits set to 1 in unsigned 64-bit integer a, and +// return that count in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_popcnt_u64 +FORCE_INLINE int64_t _mm_popcnt_u64(uint64_t a) +{ +#if defined(__aarch64__) || defined(_M_ARM64) +#if __has_builtin(__builtin_popcountll) + return __builtin_popcountll(a); +#elif defined(_MSC_VER) + return _CountOneBits64(a); +#else + return (int64_t) vaddlv_u8(vcnt_u8(vcreate_u8(a))); +#endif +#else + uint64_t count = 0; + uint8x8_t input_val, count8x8_val; + uint16x4_t count16x4_val; + uint32x2_t count32x2_val; + uint64x1_t count64x1_val; + + input_val = vld1_u8((uint8_t *) &a); + count8x8_val = vcnt_u8(input_val); + count16x4_val = vpaddl_u8(count8x8_val); + count32x2_val = vpaddl_u16(count16x4_val); + count64x1_val = vpaddl_u32(count32x2_val); + vst1_u64(&count, count64x1_val); + return count; +#endif +} + +FORCE_INLINE void _sse2neon_mm_set_denormals_zero_mode(unsigned int flag) +{ + // AArch32 Advanced SIMD arithmetic always uses the Flush-to-zero setting, + // regardless of the value of the FZ bit. 
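+    // Bit 24 of FPCR (AArch64) / FPSCR (AArch32) is the FZ flush-to-zero bit;
+    // it is set below whenever _MM_DENORMALS_ZERO_ON is requested.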
+ union { + fpcr_bitfield field; +#if defined(__aarch64__) || defined(_M_ARM64) + uint64_t value; +#else + uint32_t value; +#endif + } r; + +#if defined(__aarch64__) || defined(_M_ARM64) + r.value = _sse2neon_get_fpcr(); +#else + __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */ +#endif + + r.field.bit24 = (flag & _MM_DENORMALS_ZERO_MASK) == _MM_DENORMALS_ZERO_ON; + +#if defined(__aarch64__) || defined(_M_ARM64) + _sse2neon_set_fpcr(r.value); +#else + __asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r)); /* write */ +#endif +} + +// Return the current 64-bit value of the processor's time-stamp counter. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=rdtsc +FORCE_INLINE uint64_t _rdtsc(void) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + uint64_t val; + + /* According to ARM DDI 0487F.c, from Armv8.0 to Armv8.5 inclusive, the + * system counter is at least 56 bits wide; from Armv8.6, the counter + * must be 64 bits wide. So the system counter could be less than 64 + * bits wide and it is attributed with the flag 'cap_user_time_short' + * is true. + */ +#if defined(_MSC_VER) && !defined(__clang__) + val = _ReadStatusReg(ARM64_SYSREG(3, 3, 14, 0, 2)); +#else + __asm__ __volatile__("mrs %0, cntvct_el0" : "=r"(val)); +#endif + + return val; +#else + uint32_t pmccntr, pmuseren, pmcntenset; + // Read the user mode Performance Monitoring Unit (PMU) + // User Enable Register (PMUSERENR) access permissions. + __asm__ __volatile__("mrc p15, 0, %0, c9, c14, 0" : "=r"(pmuseren)); + if (pmuseren & 1) { // Allows reading PMUSERENR for user mode code. + __asm__ __volatile__("mrc p15, 0, %0, c9, c12, 1" : "=r"(pmcntenset)); + if (pmcntenset & 0x80000000UL) { // Is it counting? + __asm__ __volatile__("mrc p15, 0, %0, c9, c13, 0" : "=r"(pmccntr)); + // The counter is set up to count every 64th cycle + return (uint64_t) (pmccntr) << 6; + } + } + + // Fallback to syscall as we can't enable PMUSERENR in user mode. + struct timeval tv; + gettimeofday(&tv, NULL); + return (uint64_t) (tv.tv_sec) * 1000000 + tv.tv_usec; +#endif +} + +#if defined(__GNUC__) || defined(__clang__) +#pragma pop_macro("ALIGN_STRUCT") +#pragma pop_macro("FORCE_INLINE") +#endif + +#if defined(__GNUC__) && !defined(__clang__) +#pragma GCC pop_options +#endif + +#endif From 9f2faa4c441ccaa61a993c7f981b5e5ad7b0dd74 Mon Sep 17 00:00:00 2001 From: Even Rouault Date: Sun, 17 Mar 2024 00:24:32 +0100 Subject: [PATCH 20/62] CMake: add detection of potential use of sse2neon.h, and a GDAL_ENABLE_ARM_NEON_OPTIMIZATIONS variable --- CMakeLists.txt | 10 ++++++++++ gcore/include_sse2neon.h | 28 ++++++++++++++++++++++++++++ gdal.cmake | 7 +++++++ 3 files changed, 45 insertions(+) create mode 100644 gcore/include_sse2neon.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 367cda089dca..c9fc4d5e1e4b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -95,6 +95,16 @@ if ("${CMAKE_SYSTEM_PROCESSOR}" MATCHES "(x86|AMD64)") endif () endif () +else() + + # Check ability to use Arm Neon optimizations + include(CheckCXXSourceCompiles) + include(CMakePushCheckState) + cmake_push_check_state(RESET) + set(CMAKE_REQUIRED_INCLUDES "${CMAKE_CURRENT_SOURCE_DIR}/gcore") + check_cxx_source_compiles("#include \"include_sse2neon.h\"\nint main() { return 0; }" SSE2NEON_COMPILES) + cmake_pop_check_state() + endif () # option(CLANG_TIDY_ENABLED "Run clang-tidy with the compiler." 
OFF) diff --git a/gcore/include_sse2neon.h b/gcore/include_sse2neon.h new file mode 100644 index 000000000000..44ce746fabb8 --- /dev/null +++ b/gcore/include_sse2neon.h @@ -0,0 +1,28 @@ +/****************************************************************************** + * + * Project: GDAL + * Purpose: Includes sse2neon.h headers + * Author: Even Rouault + * + ****************************************************************************** + * Copyright (c) 2024, Even Rouault + * + * SPDX-License-Identifier: MIT + *****************************************************************************/ + +#ifndef INCLUDE_SSE2NEON_H +#define INCLUDE_SSE2NEON_H + +#if defined(__GNUC__) +#pragma GCC system_header +#endif + +// This check is done in sse2neon.h just as a warning. Turn that into an +// error, so that gdal.cmake doesn't try to use it +#if !defined(__clang__) && defined(__GNUC__) && __GNUC__ < 10 +#error "sse2neon.h: GCC versions earlier than 10 are not supported." +#endif + +#include "sse2neon.h" + +#endif /* INCLUDE_SSE2NEON_H */ diff --git a/gdal.cmake b/gdal.cmake index 6af60831fb01..1799ea709019 100644 --- a/gdal.cmake +++ b/gdal.cmake @@ -40,6 +40,13 @@ option(GDAL_OBJECT_LIBRARIES_POSITION_INDEPENDENT_CODE "Set ON to produce -fPIC # Option to set preferred C# compiler option(CSHARP_MONO "Whether to force the C# compiler to be Mono" OFF) +if (SSE2NEON_COMPILES) + option(GDAL_ENABLE_ARM_NEON_OPTIMIZATIONS "Set ON to use ARM Neon FPU optimizations" ON) + if (GDAL_ENABLE_ARM_NEON_OPTIMIZATIONS) + message(STATUS "Using ARM Neon optimizations") + endif() +endif() + # This line must be kept early in the CMake instructions. At time of writing, # this file is populated only be scripts/install_bash_completions.cmake.in install(CODE "file(REMOVE \"${PROJECT_BINARY_DIR}/install_manifest_extra.txt\")") From f49cebcb72671a6775cb0643d28d77789c6ef397 Mon Sep 17 00:00:00 2001 From: Even Rouault Date: Sun, 3 Nov 2024 22:47:20 +0100 Subject: [PATCH 21/62] sse2neon.h: disable #warning --- gcore/sse2neon.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/gcore/sse2neon.h b/gcore/sse2neon.h index 10a1196ebd39..7754a2dc574b 100644 --- a/gcore/sse2neon.h +++ b/gcore/sse2neon.h @@ -110,10 +110,13 @@ #warning "GCC versions earlier than 10 are not supported." #endif +// Disabled by GDAL to avoid issues with -Werror +#if 0 #ifdef __OPTIMIZE__ #warning \ "Report any potential compiler optimization issues when using SSE2NEON. See the 'Optimization' section at https://github.com/DLTcollab/sse2neon." #endif +#endif /* C language does not allow initializing a variable with a function call. 
*/ #ifdef __cplusplus From 0b143747dd1e3dbd8f6ba11bb9592b0c5b2a8698 Mon Sep 17 00:00:00 2001 From: Even Rouault Date: Sun, 17 Mar 2024 00:25:40 +0100 Subject: [PATCH 22/62] Use include_sse2neon.h in gcore/rasterio functionnality --- gcore/CMakeLists.txt | 5 +++- gcore/gdal_priv_templates.hpp | 7 ++++- gcore/rasterio.cpp | 53 +++++++++++++++++++++-------------- gcore/rasterio_ssse3.cpp | 10 +++++-- gcore/rasterio_ssse3.h | 2 +- port/cpl_cpu_features.h | 7 ++++- 6 files changed, 57 insertions(+), 27 deletions(-) diff --git a/gcore/CMakeLists.txt b/gcore/CMakeLists.txt index 08fe7e646b7b..c4b00532c8e3 100644 --- a/gcore/CMakeLists.txt +++ b/gcore/CMakeLists.txt @@ -104,7 +104,10 @@ if (NOT GDAL_AUTOLOAD_PLUGINS) PROPERTY COMPILE_DEFINITIONS GDAL_NO_AUTOLOAD) endif () -if (HAVE_SSSE3_AT_COMPILE_TIME) +if (GDAL_ENABLE_ARM_NEON_OPTIMIZATIONS) + target_compile_definitions(gcore PRIVATE -DHAVE_SSSE3_AT_COMPILE_TIME -DUSE_NEON_OPTIMIZATIONS) + target_sources(gcore PRIVATE rasterio_ssse3.cpp) +elseif (HAVE_SSSE3_AT_COMPILE_TIME) target_compile_definitions(gcore PRIVATE -DHAVE_SSSE3_AT_COMPILE_TIME) target_sources(gcore PRIVATE rasterio_ssse3.cpp) set_property( diff --git a/gcore/gdal_priv_templates.hpp b/gcore/gdal_priv_templates.hpp index 3c20c055687b..cb1631485cf1 100644 --- a/gcore/gdal_priv_templates.hpp +++ b/gcore/gdal_priv_templates.hpp @@ -585,9 +585,14 @@ inline void GDALCopy8Words(const Tin *pValueIn, Tout *const pValueOut) } // Needs SSE2 -#if defined(__x86_64) || defined(_M_X64) || defined(USE_SSE2) +#if defined(__x86_64) || defined(_M_X64) || defined(USE_SSE2) || \ + defined(USE_NEON_OPTIMIZATIONS) +#ifdef USE_NEON_OPTIMIZATIONS +#include "include_sse2neon.h" +#else #include +#endif static inline void GDALCopyXMMToInt32(const __m128i xmm, void *pDest) { diff --git a/gcore/rasterio.cpp b/gcore/rasterio.cpp index 880e851204b9..c46df08b9b88 100644 --- a/gcore/rasterio.cpp +++ b/gcore/rasterio.cpp @@ -41,6 +41,18 @@ #include "memdataset.h" #include "vrtdataset.h" +#if defined(__x86_64) || defined(_M_X64) +#include +#define HAVE_SSE2 +#elif defined(USE_NEON_OPTIMIZATIONS) +#include "include_sse2neon.h" +#define HAVE_SSE2 +#endif + +#ifdef HAVE_SSSE3_AT_COMPILE_TIME +#include "rasterio_ssse3.h" +#endif + static void GDALFastCopyByte(const GByte *CPL_RESTRICT pSrcData, int nSrcPixelStride, GByte *CPL_RESTRICT pDstData, int nDstPixelStride, GPtrDiff_t nWordCount); @@ -2217,9 +2229,7 @@ static void inline GDALCopyWordsT_8atatime( } } -#if defined(__x86_64) || defined(_M_X64) - -#include +#ifdef HAVE_SSE2 template void GDALCopyWordsByteTo16Bit(const GByte *const CPL_RESTRICT pSrcData, @@ -2630,7 +2640,7 @@ void GDALCopyWordsT(const double *const CPL_RESTRICT pSrcData, nDstPixelStride, nWordCount); } -#endif // defined(__x86_64) || defined(_M_X64) +#endif // HAVE_SSE2 template <> void GDALCopyWordsT(const float *const CPL_RESTRICT pSrcData, @@ -3068,13 +3078,7 @@ static inline void GDALUnrolledCopy(T *CPL_RESTRICT pDest, GDALUnrolledCopyGeneric(pDest, pSrc, nIters); } -#if (defined(__x86_64) || defined(_M_X64)) - -#ifdef HAVE_SSSE3_AT_COMPILE_TIME - -#include "rasterio_ssse3.h" - -#endif +#ifdef HAVE_SSE2 template <> void GDALUnrolledCopy(GByte *CPL_RESTRICT pDest, @@ -3175,7 +3179,7 @@ void GDALUnrolledCopy(GByte *CPL_RESTRICT pDest, pSrc += 4; } } -#endif // defined(__x86_64) || defined(_M_X64) +#endif // HAVE_SSE2 /************************************************************************/ /* GDALFastCopy() */ @@ -5299,13 +5303,7 @@ bool GDALBufferHasOnlyNoData(const void *pBuffer, double 
dfNoDataValue, return false; } -#if defined(__x86_64) || defined(_M_X64) - -#include - -#ifdef HAVE_SSSE3_AT_COMPILE_TIME -#include "rasterio_ssse3.h" -#endif +#ifdef HAVE_SSE2 /************************************************************************/ /* GDALDeinterleave3Byte() */ @@ -5319,6 +5317,12 @@ GDALDeinterleave3Byte(const GByte *CPL_RESTRICT pabySrc, GByte *CPL_RESTRICT pabyDest0, GByte *CPL_RESTRICT pabyDest1, GByte *CPL_RESTRICT pabyDest2, size_t nIters) +#ifdef USE_NEON_OPTIMIZATIONS +{ + return GDALDeinterleave3Byte_SSSE3(pabySrc, pabyDest0, pabyDest1, pabyDest2, + nIters); +} +#else { #ifdef HAVE_SSSE3_AT_COMPILE_TIME if (CPLHaveRuntimeSSSE3()) @@ -5366,6 +5370,7 @@ GDALDeinterleave3Byte(const GByte *CPL_RESTRICT pabySrc, pabyDest2[i] = pabySrc[3 * i + 2]; } } +#endif /************************************************************************/ /* GDALDeinterleave4Byte() */ @@ -5421,6 +5426,12 @@ static void GDALDeinterleave4Byte(const GByte *CPL_RESTRICT pabySrc, GByte *CPL_RESTRICT pabyDest1, GByte *CPL_RESTRICT pabyDest2, GByte *CPL_RESTRICT pabyDest3, size_t nIters) +#ifdef USE_NEON_OPTIMIZATIONS +{ + return GDALDeinterleave4Byte_SSSE3(pabySrc, pabyDest0, pabyDest1, pabyDest2, + pabyDest3, nIters); +} +#else { #ifdef HAVE_SSSE3_AT_COMPILE_TIME if (CPLHaveRuntimeSSSE3()) @@ -5469,6 +5480,7 @@ static void GDALDeinterleave4Byte(const GByte *CPL_RESTRICT pabySrc, pabyDest3[i] = pabySrc[4 * i + 3]; } } +#endif #else // GCC autovectorizer does an excellent job __attribute__((optimize("tree-vectorize"))) static void GDALDeinterleave4Byte( @@ -5596,8 +5608,7 @@ void GDALDeinterleave(const void *pSourceBuffer, GDALDataType eSourceDT, } #if ((defined(__GNUC__) && !defined(__clang__)) || \ defined(__INTEL_CLANG_COMPILER)) && \ - (defined(__x86_64) || defined(_M_X64)) && \ - defined(HAVE_SSSE3_AT_COMPILE_TIME) + defined(HAVE_SSE2) && defined(HAVE_SSSE3_AT_COMPILE_TIME) else if ((eSourceDT == GDT_Int16 || eSourceDT == GDT_UInt16) && CPLHaveRuntimeSSSE3()) { diff --git a/gcore/rasterio_ssse3.cpp b/gcore/rasterio_ssse3.cpp index 7b770f6030b5..fa9cd6ab24e4 100644 --- a/gcore/rasterio_ssse3.cpp +++ b/gcore/rasterio_ssse3.cpp @@ -12,12 +12,18 @@ #include "cpl_port.h" -#if defined(HAVE_SSSE3_AT_COMPILE_TIME) && \ - (defined(__x86_64) || defined(_M_X64)) +#if (defined(HAVE_SSSE3_AT_COMPILE_TIME) && \ + (defined(__x86_64) || defined(_M_X64))) || \ + defined(USE_NEON_OPTIMIZATIONS) #include "rasterio_ssse3.h" +#ifdef USE_NEON_OPTIMIZATIONS +#include "include_sse2neon.h" +#else #include +#endif + #include "gdal_priv_templates.hpp" void GDALUnrolledCopy_GByte_3_1_SSSE3(GByte *CPL_RESTRICT pDest, diff --git a/gcore/rasterio_ssse3.h b/gcore/rasterio_ssse3.h index 57f5d556fd10..ac20e45c6c57 100644 --- a/gcore/rasterio_ssse3.h +++ b/gcore/rasterio_ssse3.h @@ -16,7 +16,7 @@ #include "cpl_port.h" #if defined(HAVE_SSSE3_AT_COMPILE_TIME) && \ - (defined(__x86_64) || defined(_M_X64)) + (defined(__x86_64) || defined(_M_X64) || defined(USE_NEON_OPTIMIZATIONS)) void GDALUnrolledCopy_GByte_3_1_SSSE3(GByte *CPL_RESTRICT pDest, const GByte *CPL_RESTRICT pSrc, diff --git a/port/cpl_cpu_features.h b/port/cpl_cpu_features.h index 9106ed5c39e5..10d3daaf9f1b 100644 --- a/port/cpl_cpu_features.h +++ b/port/cpl_cpu_features.h @@ -31,7 +31,12 @@ bool CPLHaveRuntimeSSE(); #endif #endif -#ifdef HAVE_SSSE3_AT_COMPILE_TIME +#ifdef USE_NEON_OPTIMIZATIONS +static bool inline CPLHaveRuntimeSSSE3() +{ + return true; +} +#elif defined(HAVE_SSSE3_AT_COMPILE_TIME) #if __SSSE3__ #define HAVE_INLINE_SSSE3 From 
6fa1a8e6dc4f9475e5f0db0f314adabbdbce79f6 Mon Sep 17 00:00:00 2001 From: Even Rouault Date: Sun, 17 Mar 2024 02:08:03 +0100 Subject: [PATCH 23/62] include_sse2neon.h: add missing _MM_SHUFFLE2() --- gcore/include_sse2neon.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/gcore/include_sse2neon.h b/gcore/include_sse2neon.h index 44ce746fabb8..fd1dbb927cb8 100644 --- a/gcore/include_sse2neon.h +++ b/gcore/include_sse2neon.h @@ -25,4 +25,8 @@ #include "sse2neon.h" +#ifndef _MM_SHUFFLE2 +#define _MM_SHUFFLE2(fp1, fp0) (((fp1) << 1) | (fp0)) +#endif + #endif /* INCLUDE_SSE2NEON_H */ From d922dc5763efd249072c73b7139887152b2f16c1 Mon Sep 17 00:00:00 2001 From: Even Rouault Date: Sun, 17 Mar 2024 02:41:53 +0100 Subject: [PATCH 24/62] Use include_sse2neon.h in gcore/overview functionality --- gcore/gdalsse_priv.h | 7 ++++++- gcore/overview.cpp | 12 +++++++++--- 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/gcore/gdalsse_priv.h b/gcore/gdalsse_priv.h index 3c7ec7ba8cdd..ade33367ee55 100644 --- a/gcore/gdalsse_priv.h +++ b/gcore/gdalsse_priv.h @@ -23,13 +23,18 @@ #if (defined(__x86_64) || defined(_M_X64) || defined(USE_SSE2)) && \ !defined(USE_SSE2_EMULATION) +#include + +#ifdef USE_NEON_OPTIMIZATIONS +#include "include_sse2neon.h" +#else /* Requires SSE2 */ #include -#include #ifdef __SSE4_1__ #include #endif +#endif #include "gdal_priv_templates.hpp" diff --git a/gcore/overview.cpp b/gcore/overview.cpp index 5867ac11b04f..846c89a91e4e 100644 --- a/gcore/overview.cpp +++ b/gcore/overview.cpp @@ -36,9 +36,15 @@ #include "gdal_thread_pool.h" #include "gdalwarper.h" +#ifdef USE_NEON_OPTIMIZATIONS +#include "include_sse2neon.h" +#define USE_SSE2 + +#include "gdalsse_priv.h" + // Restrict to 64bit processors because they are guaranteed to have SSE2, // or if __AVX2__ is defined. 
-#if defined(__x86_64) || defined(_M_X64) || defined(__AVX2__) +#elif defined(__x86_64) || defined(_M_X64) || defined(__AVX2__) #define USE_SSE2 #include "gdalsse_priv.h" @@ -335,7 +341,7 @@ inline GUInt16 ComputeIntegerRMS_4values(double sumSquares) /* QuadraticMeanByteSSE2OrAVX2() */ /************************************************************************/ -#ifdef __SSE4_1__ +#if defined(__SSE4_1__) || defined(USE_NEON_OPTIMIZATIONS) #define sse2_packus_epi32 _mm_packus_epi32 #else inline __m128i sse2_packus_epi32(__m128i a, __m128i b) @@ -350,7 +356,7 @@ inline __m128i sse2_packus_epi32(__m128i a, __m128i b) } #endif -#ifdef __SSSE3__ +#if defined(__SSSE3__) || defined(USE_NEON_OPTIMIZATIONS) #define sse2_hadd_epi16 _mm_hadd_epi16 #else inline __m128i sse2_hadd_epi16(__m128i a, __m128i b) From b42f7a8a3853a4b747d8291a744efa402e15d9bf Mon Sep 17 00:00:00 2001 From: Even Rouault Date: Sun, 3 Nov 2024 22:30:20 +0100 Subject: [PATCH 25/62] Use include_sse2neon.h in gcore/gdal_minmax_element.hpp --- gcore/gdal_minmax_element.hpp | 6 +++++- perftests/CMakeLists.txt | 3 +++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/gcore/gdal_minmax_element.hpp b/gcore/gdal_minmax_element.hpp index 0da63649a06d..a213c36eea71 100644 --- a/gcore/gdal_minmax_element.hpp +++ b/gcore/gdal_minmax_element.hpp @@ -33,14 +33,18 @@ #error "Please define the GDAL_MINMAXELT_NS macro to define the namespace" #endif +#ifdef USE_NEON_OPTIMIZATIONS +#include "include_sse2neon.h" +#define GDAL_MINMAX_ELEMENT_USE_SSE2 +#else #if defined(__x86_64) || defined(_M_X64) #define GDAL_MINMAX_ELEMENT_USE_SSE2 #endif - #ifdef GDAL_MINMAX_ELEMENT_USE_SSE2 // SSE2 header #include #endif +#endif #include "gdal_priv_templates.hpp" #if GDAL_VERSION < GDAL_COMPUTE_VERSION(3, 10, 0) diff --git a/perftests/CMakeLists.txt b/perftests/CMakeLists.txt index 1c7546dcef97..a103013d3c9b 100644 --- a/perftests/CMakeLists.txt +++ b/perftests/CMakeLists.txt @@ -25,5 +25,8 @@ gdal_standard_includes(bench_ogr_c_api) target_link_libraries(bench_ogr_c_api PRIVATE $) gdal_test_target(testperf_gdal_minmax_element testperf_gdal_minmax_element.cpp) +if (GDAL_ENABLE_ARM_NEON_OPTIMIZATIONS) + target_compile_definitions(testperf_gdal_minmax_element PRIVATE -DUSE_NEON_OPTIMIZATIONS) +endif() add_test(NAME testperf_gdal_minmax_element COMMAND testperf_gdal_minmax_element) set_property(TEST testperf_gdal_minmax_element PROPERTY ENVIRONMENT "${TEST_ENV}") From df45c33db2fbf7d8ff7a4665e62702eabd3052c5 Mon Sep 17 00:00:00 2001 From: Even Rouault Date: Mon, 4 Nov 2024 03:50:55 +0100 Subject: [PATCH 26/62] scripts/fix_typos.sh: skip sse2neon.h --- scripts/fix_typos.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/fix_typos.sh b/scripts/fix_typos.sh index f218033606eb..564212398542 100755 --- a/scripts/fix_typos.sh +++ b/scripts/fix_typos.sh @@ -77,6 +77,7 @@ EXCLUDED_FILES="$EXCLUDED_FILES,./cmake/modules/CMakeCheckCompilerFlagCommonPatt EXCLUDED_FILES="$EXCLUDED_FILES,./cmake/modules/Copyright.txt" EXCLUDED_FILES="$EXCLUDED_FILES,*/sqlite_rtree_bulk_load/*" EXCLUDED_FILES="$EXCLUDED_FILES,ogr_adbc_internal.h" +EXCLUDED_FILES="$EXCLUDED_FILES,sse2neon.h" EXCLUDED_FILES="$EXCLUDED_FILES,*/spelling_wordlist.txt" AUTHORIZED_LIST="poSession,FIDN,TRAFIC,HTINK,repID,oCurr,INTREST,oPosition" AUTHORIZED_LIST="$AUTHORIZED_LIST,CPL_SUPRESS_CPLUSPLUS,SRP_NAM,ADRG_NAM,'SRP_NAM,AuxilaryTarget" From 06ea2d41e689cfce93fff7f5e40e120c06e3bd41 Mon Sep 17 00:00:00 2001 From: Even Rouault Date: Mon, 4 Nov 2024 03:06:44 +0100 Subject: [PATCH 27/62] 
gdal_minmax_element.hpp: use SSE4.1 _mm_blendv_XXX() when possible --- gcore/gdal_minmax_element.hpp | 47 +++++++++++++++++++++++++++-------- 1 file changed, 37 insertions(+), 10 deletions(-) diff --git a/gcore/gdal_minmax_element.hpp b/gcore/gdal_minmax_element.hpp index a213c36eea71..5faeee772547 100644 --- a/gcore/gdal_minmax_element.hpp +++ b/gcore/gdal_minmax_element.hpp @@ -43,6 +43,9 @@ #ifdef GDAL_MINMAX_ELEMENT_USE_SSE2 // SSE2 header #include +#ifdef __SSE4_1__ +#include +#endif #endif #endif @@ -294,6 +297,35 @@ static inline __m128i comp(SSE_T x, SSE_T y) } } +template static inline T blendv(T a, T b, T mask); + +template <> __m128i blendv(__m128i a, __m128i b, __m128i mask) +{ +#if defined(__SSE4_1__) || defined(USE_NEON_OPTIMIZATIONS) + return _mm_blendv_epi8(a, b, mask); +#else + return _mm_or_si128(_mm_andnot_si128(mask, a), _mm_and_si128(mask, b)); +#endif +} + +template <> __m128 blendv(__m128 a, __m128 b, __m128 mask) +{ +#if defined(__SSE4_1__) || defined(USE_NEON_OPTIMIZATIONS) + return _mm_blendv_ps(a, b, mask); +#else + return _mm_or_ps(_mm_andnot_ps(mask, a), _mm_and_ps(mask, b)); +#endif +} + +template <> __m128d blendv(__m128d a, __m128d b, __m128d mask) +{ +#if defined(__SSE4_1__) || defined(USE_NEON_OPTIMIZATIONS) + return _mm_blendv_pd(a, b, mask); +#else + return _mm_or_pd(_mm_andnot_pd(mask, a), _mm_and_pd(mask, b)); +#endif +} + // Using SSE2 template inline size_t extremum_element_with_nan(const T *v, size_t size, T noDataValue) @@ -412,8 +444,7 @@ inline size_t extremum_element_with_nan(const T *v, size_t size, T noDataValue) [sse_neutral, sse_nodata](auto sse_val) { const auto eq_nodata = _mm_cmpeq_epi8(sse_val, sse_nodata); - return _mm_or_si128(_mm_and_si128(eq_nodata, sse_neutral), - _mm_andnot_si128(eq_nodata, sse_val)); + return blendv(sse_val, sse_neutral, eq_nodata); }; sse_val0 = replaceNoDataByNeutral(sse_val0); @@ -428,8 +459,7 @@ inline size_t extremum_element_with_nan(const T *v, size_t size, T noDataValue) [sse_neutral, sse_nodata](auto sse_val) { const auto eq_nodata = _mm_cmpeq_epi16(sse_val, sse_nodata); - return _mm_or_si128(_mm_and_si128(eq_nodata, sse_neutral), - _mm_andnot_si128(eq_nodata, sse_val)); + return blendv(sse_val, sse_neutral, eq_nodata); }; sse_val0 = replaceNoDataByNeutral(sse_val0); @@ -444,8 +474,7 @@ inline size_t extremum_element_with_nan(const T *v, size_t size, T noDataValue) [sse_neutral, sse_nodata](auto sse_val) { const auto eq_nodata = _mm_cmpeq_epi32(sse_val, sse_nodata); - return _mm_or_si128(_mm_and_si128(eq_nodata, sse_neutral), - _mm_andnot_si128(eq_nodata, sse_val)); + return blendv(sse_val, sse_neutral, eq_nodata); }; sse_val0 = replaceNoDataByNeutral(sse_val0); @@ -459,8 +488,7 @@ inline size_t extremum_element_with_nan(const T *v, size_t size, T noDataValue) [sse_neutral, sse_nodata](auto sse_val) { const auto eq_nodata = _mm_cmpeq_ps(sse_val, sse_nodata); - return _mm_or_ps(_mm_and_ps(eq_nodata, sse_neutral), - _mm_andnot_ps(eq_nodata, sse_val)); + return blendv(sse_val, sse_neutral, eq_nodata); }; sse_val0 = replaceNoDataByNeutral(sse_val0); @@ -474,8 +502,7 @@ inline size_t extremum_element_with_nan(const T *v, size_t size, T noDataValue) [sse_neutral, sse_nodata](auto sse_val) { const auto eq_nodata = _mm_cmpeq_pd(sse_val, sse_nodata); - return _mm_or_pd(_mm_and_pd(eq_nodata, sse_neutral), - _mm_andnot_pd(eq_nodata, sse_val)); + return blendv(sse_val, sse_neutral, eq_nodata); }; sse_val0 = replaceNoDataByNeutral(sse_val0); From 0d4970faf39249e40754a76f4d8eff86e8098fba Mon Sep 17 00:00:00 2001 From: 
Even Rouault Date: Mon, 4 Nov 2024 03:22:43 +0100 Subject: [PATCH 28/62] gdal_minmax_element.hpp: improve code --- gcore/gdal_minmax_element.hpp | 132 ++++++++++++++-------------------- 1 file changed, 52 insertions(+), 80 deletions(-) diff --git a/gcore/gdal_minmax_element.hpp b/gcore/gdal_minmax_element.hpp index 5faeee772547..1e6f7db9b432 100644 --- a/gcore/gdal_minmax_element.hpp +++ b/gcore/gdal_minmax_element.hpp @@ -297,6 +297,48 @@ static inline __m128i comp(SSE_T x, SSE_T y) } } +template static inline SSE_T compeq(SSE_T a, SSE_T b); + +template <> __m128i compeq(__m128i a, __m128i b) +{ + return _mm_cmpeq_epi8(a, b); +} + +template <> __m128i compeq(__m128i a, __m128i b) +{ + return _mm_cmpeq_epi8(a, b); +} + +template <> __m128i compeq(__m128i a, __m128i b) +{ + return _mm_cmpeq_epi16(a, b); +} + +template <> __m128i compeq(__m128i a, __m128i b) +{ + return _mm_cmpeq_epi16(a, b); +} + +template <> __m128i compeq(__m128i a, __m128i b) +{ + return _mm_cmpeq_epi32(a, b); +} + +template <> __m128i compeq(__m128i a, __m128i b) +{ + return _mm_cmpeq_epi32(a, b); +} + +template <> __m128 compeq(__m128 a, __m128 b) +{ + return _mm_cmpeq_ps(a, b); +} + +template <> __m128d compeq(__m128d a, __m128d b) +{ + return _mm_cmpeq_pd(a, b); +} + template static inline T blendv(T a, T b, T mask); template <> __m128i blendv(__m128i a, __m128i b, __m128i mask) @@ -437,87 +479,17 @@ inline size_t extremum_element_with_nan(const T *v, size_t size, T noDataValue) { // Replace all components that are at the nodata value by a // neutral value (current minimum) - if constexpr (std::is_same_v || - std::is_same_v) + const auto replaceNoDataByNeutral = + [sse_neutral, sse_nodata](auto sse_val) { - const auto replaceNoDataByNeutral = - [sse_neutral, sse_nodata](auto sse_val) - { - const auto eq_nodata = _mm_cmpeq_epi8(sse_val, sse_nodata); - return blendv(sse_val, sse_neutral, eq_nodata); - }; - - sse_val0 = replaceNoDataByNeutral(sse_val0); - sse_val1 = replaceNoDataByNeutral(sse_val1); - sse_val2 = replaceNoDataByNeutral(sse_val2); - sse_val3 = replaceNoDataByNeutral(sse_val3); - } - else if constexpr (std::is_same_v || - std::is_same_v) - { - const auto replaceNoDataByNeutral = - [sse_neutral, sse_nodata](auto sse_val) - { - const auto eq_nodata = _mm_cmpeq_epi16(sse_val, sse_nodata); - return blendv(sse_val, sse_neutral, eq_nodata); - }; - - sse_val0 = replaceNoDataByNeutral(sse_val0); - sse_val1 = replaceNoDataByNeutral(sse_val1); - sse_val2 = replaceNoDataByNeutral(sse_val2); - sse_val3 = replaceNoDataByNeutral(sse_val3); - } - else if constexpr (std::is_same_v || - std::is_same_v) - { - const auto replaceNoDataByNeutral = - [sse_neutral, sse_nodata](auto sse_val) - { - const auto eq_nodata = _mm_cmpeq_epi32(sse_val, sse_nodata); - return blendv(sse_val, sse_neutral, eq_nodata); - }; - - sse_val0 = replaceNoDataByNeutral(sse_val0); - sse_val1 = replaceNoDataByNeutral(sse_val1); - sse_val2 = replaceNoDataByNeutral(sse_val2); - sse_val3 = replaceNoDataByNeutral(sse_val3); - } - else if constexpr (std::is_same_v) - { - const auto replaceNoDataByNeutral = - [sse_neutral, sse_nodata](auto sse_val) - { - const auto eq_nodata = _mm_cmpeq_ps(sse_val, sse_nodata); - return blendv(sse_val, sse_neutral, eq_nodata); - }; - - sse_val0 = replaceNoDataByNeutral(sse_val0); - sse_val1 = replaceNoDataByNeutral(sse_val1); - sse_val2 = replaceNoDataByNeutral(sse_val2); - sse_val3 = replaceNoDataByNeutral(sse_val3); - } - else if constexpr (std::is_same_v) - { - const auto replaceNoDataByNeutral = - [sse_neutral, 
sse_nodata](auto sse_val) - { - const auto eq_nodata = _mm_cmpeq_pd(sse_val, sse_nodata); - return blendv(sse_val, sse_neutral, eq_nodata); - }; - - sse_val0 = replaceNoDataByNeutral(sse_val0); - sse_val1 = replaceNoDataByNeutral(sse_val1); - sse_val2 = replaceNoDataByNeutral(sse_val2); - sse_val3 = replaceNoDataByNeutral(sse_val3); - } - else - { - static_assert( - std::is_same_v || std::is_same_v || - std::is_same_v || std::is_same_v || - std::is_same_v || std::is_same_v || - std::is_same_v || std::is_same_v); - } + const auto eq_nodata = compeq(sse_val, sse_nodata); + return blendv(sse_val, sse_neutral, eq_nodata); + }; + + sse_val0 = replaceNoDataByNeutral(sse_val0); + sse_val1 = replaceNoDataByNeutral(sse_val1); + sse_val2 = replaceNoDataByNeutral(sse_val2); + sse_val3 = replaceNoDataByNeutral(sse_val3); } if (_mm_movemask_epi8(_mm_or_si128( From 648add692701007aaa34a5e4b857a6334e0f399a Mon Sep 17 00:00:00 2001 From: Even Rouault Date: Mon, 4 Nov 2024 13:34:30 +0100 Subject: [PATCH 29/62] gcore/CMakeLists.txt: also install gdal_priv_templates.hpp --- gcore/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/gcore/CMakeLists.txt b/gcore/CMakeLists.txt index 08fe7e646b7b..bc710a7fcc65 100644 --- a/gcore/CMakeLists.txt +++ b/gcore/CMakeLists.txt @@ -210,6 +210,7 @@ target_public_header( gdal_typetraits.h gdal_adbc.h gdal_minmax_element.hpp + gdal_priv_templates.hpp # Required by gdal_minmax_element.hpp ) set(GDAL_DATA_FILES From 9164d9d55d2d81c4699ce3ee2fb3feaec998b86b Mon Sep 17 00:00:00 2001 From: Even Rouault Date: Mon, 4 Nov 2024 13:31:56 +0100 Subject: [PATCH 30/62] gdal_minmax_element.hpp: fixes when vendoring with GDAL < 3.7 --- gcore/gdal_minmax_element.hpp | 31 ++++++++----------------------- 1 file changed, 8 insertions(+), 23 deletions(-) diff --git a/gcore/gdal_minmax_element.hpp b/gcore/gdal_minmax_element.hpp index 0da63649a06d..b6c30d2e509d 100644 --- a/gcore/gdal_minmax_element.hpp +++ b/gcore/gdal_minmax_element.hpp @@ -43,29 +43,6 @@ #endif #include "gdal_priv_templates.hpp" -#if GDAL_VERSION < GDAL_COMPUTE_VERSION(3, 10, 0) -// For vendoring in other applications -namespace GDAL_MINMAXELT_NS -{ -template inline bool GDALIsValueExactAs(double dfValue) -{ - return GDALIsValueInRange(dfValue) && - static_cast(static_cast(dfValue)) == dfValue; -} - -template <> inline bool GDALIsValueExactAs(double dfValue) -{ - return std::isnan(dfValue) || - (GDALIsValueInRange(dfValue) && - static_cast(static_cast(dfValue)) == dfValue); -} - -template <> inline bool GDALIsValueExactAs(double) -{ - return true; -} -} // namespace GDAL_MINMAXELT_NS -#endif namespace GDAL_MINMAXELT_NS { @@ -989,6 +966,7 @@ size_t extremum_element(const void *buffer, size_t nElts, GDALDataType eDT, { switch (eDT) { +#if GDAL_VERSION_NUM >= GDAL_COMPUTE_VERSION(3, 7, 0) case GDT_Int8: { using T = int8_t; @@ -997,6 +975,7 @@ size_t extremum_element(const void *buffer, size_t nElts, GDALDataType eDT, static_cast(buffer), nElts, bHasNoData, bHasNoData ? static_cast(dfNoDataValue) : 0); } +#endif case GDT_Byte: { using T = uint8_t; @@ -1037,6 +1016,7 @@ size_t extremum_element(const void *buffer, size_t nElts, GDALDataType eDT, static_cast(buffer), nElts, bHasNoData, bHasNoData ? static_cast(dfNoDataValue) : 0); } +#if GDAL_VERSION_NUM >= GDAL_COMPUTE_VERSION(3, 5, 0) case GDT_Int64: { using T = int64_t; @@ -1053,6 +1033,7 @@ size_t extremum_element(const void *buffer, size_t nElts, GDALDataType eDT, static_cast(buffer), nElts, bHasNoData, bHasNoData ? 
static_cast(dfNoDataValue) : 0); } +#endif case GDT_Float32: { using T = float; @@ -1330,6 +1311,7 @@ inline std::pair minmax_element(const void *buffer, { switch (eDT) { +#if GDAL_VERSION_NUM >= GDAL_COMPUTE_VERSION(3, 7, 0) case GDT_Int8: { using T = int8_t; @@ -1338,6 +1320,7 @@ inline std::pair minmax_element(const void *buffer, static_cast(buffer), nElts, bHasNoData, bHasNoData ? static_cast(dfNoDataValue) : 0); } +#endif case GDT_Byte: { using T = uint8_t; @@ -1378,6 +1361,7 @@ inline std::pair minmax_element(const void *buffer, static_cast(buffer), nElts, bHasNoData, bHasNoData ? static_cast(dfNoDataValue) : 0); } +#if GDAL_VERSION_NUM >= GDAL_COMPUTE_VERSION(3, 5, 0) case GDT_Int64: { using T = int64_t; @@ -1394,6 +1378,7 @@ inline std::pair minmax_element(const void *buffer, static_cast(buffer), nElts, bHasNoData, bHasNoData ? static_cast(dfNoDataValue) : 0); } +#endif case GDT_Float32: { using T = float; From 0ef39fcfac948f75bfe09e95207fb483abc2de2c Mon Sep 17 00:00:00 2001 From: Even Rouault Date: Mon, 4 Nov 2024 14:39:21 +0100 Subject: [PATCH 31/62] CI: check that GDAL_ENABLE_ARM_NEON_OPTIMIZATIONS is on on MacOSX ARM64 and Android targets --- .github/workflows/android_cmake/start.sh | 4 ++++ ci/travis/osx/install.sh | 3 +++ 2 files changed, 7 insertions(+) diff --git a/.github/workflows/android_cmake/start.sh b/.github/workflows/android_cmake/start.sh index 4b275f01a3af..e7e6a090c4e1 100755 --- a/.github/workflows/android_cmake/start.sh +++ b/.github/workflows/android_cmake/start.sh @@ -88,6 +88,10 @@ PKG_CONFIG_LIBDIR=/tmp/install/lib/pkgconfig cmake .. \ -DSFCGAL_CONFIG=disabled \ -DHDF5_C_COMPILER_EXECUTABLE=disabled \ -DHDF5_CXX_COMPILER_EXECUTABLE=disabled + +echo "Check that GDAL_ENABLE_ARM_NEON_OPTIMIZATIONS:BOOL=ON" +(grep "GDAL_ENABLE_ARM_NEON_OPTIMIZATIONS:BOOL=ON" CMakeCache.txt > /dev/null && echo "yes") || (echo "Missing" && /bin/false) + make -j$(nproc) make install cd .. diff --git a/ci/travis/osx/install.sh b/ci/travis/osx/install.sh index 00e392ae0633..95f435e7e052 100755 --- a/ci/travis/osx/install.sh +++ b/ci/travis/osx/install.sh @@ -27,6 +27,9 @@ CFLAGS="-Wextra -Werror" CXXFLAGS="-Wextra -Werror" cmake .. 
\ -DBUILD_CSHARP_BINDINGS=OFF \ -DCMAKE_UNITY_BUILD=ON +echo "Check that GDAL_ENABLE_ARM_NEON_OPTIMIZATIONS:BOOL=ON" +(grep "GDAL_ENABLE_ARM_NEON_OPTIMIZATIONS:BOOL=ON" CMakeCache.txt > /dev/null && echo "yes") || (echo "Missing" && /bin/false) + NPROC=$(sysctl -n hw.ncpu) echo "NPROC=${NPROC}" make -j${NPROC} From 71078a1d162619d761a7e012be8097ef2b665d8a Mon Sep 17 00:00:00 2001 From: Even Rouault Date: Tue, 5 Nov 2024 23:01:19 +0100 Subject: [PATCH 32/62] third_party/fast_float: resync with upstream --- third_party/fast_float/PROVENANCE.TXT | 2 +- third_party/fast_float/ascii_number.h | 403 +++-- third_party/fast_float/bigint.h | 201 +-- .../fast_float/constexpr_feature_detect.h | 14 +- third_party/fast_float/decimal_to_binary.h | 124 +- third_party/fast_float/digit_comparison.h | 185 ++- third_party/fast_float/fast_float.h | 58 +- third_party/fast_float/fast_table.h | 1326 +++++++++-------- third_party/fast_float/float_common.h | 615 +++++--- third_party/fast_float/parse_number.h | 327 ++-- 10 files changed, 1854 insertions(+), 1401 deletions(-) diff --git a/third_party/fast_float/PROVENANCE.TXT b/third_party/fast_float/PROVENANCE.TXT index 0ea9d9ccd1d2..5a310b33ce6f 100644 --- a/third_party/fast_float/PROVENANCE.TXT +++ b/third_party/fast_float/PROVENANCE.TXT @@ -1,4 +1,4 @@ https://github.com/fastfloat/fast_float -Retrieved at commit https://github.com/fastfloat/fast_float/commit/a5ea2059295260922aa300d676a43a76b5e19a35 +Retrieved at commit https://github.com/fastfloat/fast_float/commit/9058831e6884e95358bcad29139a8b9d6cf0b534 Using the MIT license choice. diff --git a/third_party/fast_float/ascii_number.h b/third_party/fast_float/ascii_number.h index d18e3d5360af..c027435e2a01 100644 --- a/third_party/fast_float/ascii_number.h +++ b/third_party/fast_float/ascii_number.h @@ -5,6 +5,7 @@ #include #include #include +#include #include #include "float_common.h" @@ -19,8 +20,7 @@ namespace fast_float { -template -fastfloat_really_inline constexpr bool has_simd_opt() { +template fastfloat_really_inline constexpr bool has_simd_opt() { #ifdef FASTFLOAT_HAS_SIMD return std::is_same::value; #else @@ -36,24 +36,20 @@ fastfloat_really_inline constexpr bool is_integer(UC c) noexcept { } fastfloat_really_inline constexpr uint64_t byteswap(uint64_t val) { - return (val & 0xFF00000000000000) >> 56 - | (val & 0x00FF000000000000) >> 40 - | (val & 0x0000FF0000000000) >> 24 - | (val & 0x000000FF00000000) >> 8 - | (val & 0x00000000FF000000) << 8 - | (val & 0x0000000000FF0000) << 24 - | (val & 0x000000000000FF00) << 40 - | (val & 0x00000000000000FF) << 56; + return (val & 0xFF00000000000000) >> 56 | (val & 0x00FF000000000000) >> 40 | + (val & 0x0000FF0000000000) >> 24 | (val & 0x000000FF00000000) >> 8 | + (val & 0x00000000FF000000) << 8 | (val & 0x0000000000FF0000) << 24 | + (val & 0x000000000000FF00) << 40 | (val & 0x00000000000000FF) << 56; } // Read 8 UC into a u64. Truncates UC if not char. 
template -fastfloat_really_inline FASTFLOAT_CONSTEXPR20 -uint64_t read8_to_u64(const UC *chars) { +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 uint64_t +read8_to_u64(const UC *chars) { if (cpp20_and_in_constexpr() || !std::is_same::value) { uint64_t val = 0; - for(int i = 0; i < 8; ++i) { - val |= uint64_t(uint8_t(*chars)) << (i*8); + for (int i = 0; i < 8; ++i) { + val |= uint64_t(uint8_t(*chars)) << (i * 8); ++chars; } return val; @@ -69,44 +65,41 @@ uint64_t read8_to_u64(const UC *chars) { #ifdef FASTFLOAT_SSE2 -fastfloat_really_inline -uint64_t simd_read8_to_u64(const __m128i data) { -FASTFLOAT_SIMD_DISABLE_WARNINGS +fastfloat_really_inline uint64_t simd_read8_to_u64(const __m128i data) { + FASTFLOAT_SIMD_DISABLE_WARNINGS const __m128i packed = _mm_packus_epi16(data, data); #ifdef FASTFLOAT_64BIT return uint64_t(_mm_cvtsi128_si64(packed)); #else uint64_t value; // Visual Studio + older versions of GCC don't support _mm_storeu_si64 - _mm_storel_epi64(reinterpret_cast<__m128i*>(&value), packed); + _mm_storel_epi64(reinterpret_cast<__m128i *>(&value), packed); return value; #endif -FASTFLOAT_SIMD_RESTORE_WARNINGS + FASTFLOAT_SIMD_RESTORE_WARNINGS } -fastfloat_really_inline -uint64_t simd_read8_to_u64(const char16_t* chars) { -FASTFLOAT_SIMD_DISABLE_WARNINGS - return simd_read8_to_u64(_mm_loadu_si128(reinterpret_cast(chars))); -FASTFLOAT_SIMD_RESTORE_WARNINGS +fastfloat_really_inline uint64_t simd_read8_to_u64(const char16_t *chars) { + FASTFLOAT_SIMD_DISABLE_WARNINGS + return simd_read8_to_u64( + _mm_loadu_si128(reinterpret_cast(chars))); + FASTFLOAT_SIMD_RESTORE_WARNINGS } #elif defined(FASTFLOAT_NEON) - -fastfloat_really_inline -uint64_t simd_read8_to_u64(const uint16x8_t data) { -FASTFLOAT_SIMD_DISABLE_WARNINGS +fastfloat_really_inline uint64_t simd_read8_to_u64(const uint16x8_t data) { + FASTFLOAT_SIMD_DISABLE_WARNINGS uint8x8_t utf8_packed = vmovn_u16(data); return vget_lane_u64(vreinterpret_u64_u8(utf8_packed), 0); -FASTFLOAT_SIMD_RESTORE_WARNINGS + FASTFLOAT_SIMD_RESTORE_WARNINGS } -fastfloat_really_inline -uint64_t simd_read8_to_u64(const char16_t* chars) { -FASTFLOAT_SIMD_DISABLE_WARNINGS - return simd_read8_to_u64(vld1q_u16(reinterpret_cast(chars))); -FASTFLOAT_SIMD_RESTORE_WARNINGS +fastfloat_really_inline uint64_t simd_read8_to_u64(const char16_t *chars) { + FASTFLOAT_SIMD_DISABLE_WARNINGS + return simd_read8_to_u64( + vld1q_u16(reinterpret_cast(chars))); + FASTFLOAT_SIMD_RESTORE_WARNINGS } #endif // FASTFLOAT_SSE2 @@ -115,34 +108,16 @@ FASTFLOAT_SIMD_RESTORE_WARNINGS #if defined(_MSC_VER) && _MSC_VER <= 1900 template #else -template ())> +template ()) = 0> #endif // dummy for compile -uint64_t simd_read8_to_u64(UC const*) { +uint64_t simd_read8_to_u64(UC const *) { return 0; } - -fastfloat_really_inline FASTFLOAT_CONSTEXPR20 -void write_u64(uint8_t *chars, uint64_t val) { - if (cpp20_and_in_constexpr()) { - for(int i = 0; i < 8; ++i) { - *chars = uint8_t(val); - val >>= 8; - ++chars; - } - return; - } -#if FASTFLOAT_IS_BIG_ENDIAN == 1 - // Need to read as-if the number was in little-endian order. 
- val = byteswap(val); -#endif - ::memcpy(chars, &val, sizeof(uint64_t)); -} - // credit @aqrit -fastfloat_really_inline FASTFLOAT_CONSTEXPR14 -uint32_t parse_eight_digits_unrolled(uint64_t val) { +fastfloat_really_inline FASTFLOAT_CONSTEXPR14 uint32_t +parse_eight_digits_unrolled(uint64_t val) { const uint64_t mask = 0x000000FF000000FF; const uint64_t mul1 = 0x000F424000000064; // 100 + (1000000ULL << 32) const uint64_t mul2 = 0x0000271000000001; // 1 + (10000ULL << 32) @@ -152,38 +127,38 @@ uint32_t parse_eight_digits_unrolled(uint64_t val) { return uint32_t(val); } - // Call this if chars are definitely 8 digits. template -fastfloat_really_inline FASTFLOAT_CONSTEXPR20 -uint32_t parse_eight_digits_unrolled(UC const * chars) noexcept { +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 uint32_t +parse_eight_digits_unrolled(UC const *chars) noexcept { if (cpp20_and_in_constexpr() || !has_simd_opt()) { return parse_eight_digits_unrolled(read8_to_u64(chars)); // truncation okay } return parse_eight_digits_unrolled(simd_read8_to_u64(chars)); } - // credit @aqrit -fastfloat_really_inline constexpr bool is_made_of_eight_digits_fast(uint64_t val) noexcept { +fastfloat_really_inline constexpr bool +is_made_of_eight_digits_fast(uint64_t val) noexcept { return !((((val + 0x4646464646464646) | (val - 0x3030303030303030)) & - 0x8080808080808080)); + 0x8080808080808080)); } - #ifdef FASTFLOAT_HAS_SIMD // Call this if chars might not be 8 digits. -// Using this style (instead of is_made_of_eight_digits_fast() then parse_eight_digits_unrolled()) -// ensures we don't load SIMD registers twice. -fastfloat_really_inline FASTFLOAT_CONSTEXPR20 -bool simd_parse_if_eight_digits_unrolled(const char16_t* chars, uint64_t& i) noexcept { +// Using this style (instead of is_made_of_eight_digits_fast() then +// parse_eight_digits_unrolled()) ensures we don't load SIMD registers twice. 
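+// Returns true only when the next eight char16_t code units are all decimal
+// digits, in which case they are folded into i; otherwise i is left unchanged.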
+fastfloat_really_inline FASTFLOAT_CONSTEXPR20 bool +simd_parse_if_eight_digits_unrolled(const char16_t *chars, + uint64_t &i) noexcept { if (cpp20_and_in_constexpr()) { return false; - } + } #ifdef FASTFLOAT_SSE2 -FASTFLOAT_SIMD_DISABLE_WARNINGS - const __m128i data = _mm_loadu_si128(reinterpret_cast(chars)); + FASTFLOAT_SIMD_DISABLE_WARNINGS + const __m128i data = + _mm_loadu_si128(reinterpret_cast(chars)); // (x - '0') <= 9 // http://0x80.pl/articles/simd-parsing-int-sequences.html @@ -193,13 +168,13 @@ FASTFLOAT_SIMD_DISABLE_WARNINGS if (_mm_movemask_epi8(t1) == 0) { i = i * 100000000 + parse_eight_digits_unrolled(simd_read8_to_u64(data)); return true; - } - else return false; -FASTFLOAT_SIMD_RESTORE_WARNINGS + } else + return false; + FASTFLOAT_SIMD_RESTORE_WARNINGS #elif defined(FASTFLOAT_NEON) -FASTFLOAT_SIMD_DISABLE_WARNINGS - const uint16x8_t data = vld1q_u16(reinterpret_cast(chars)); - + FASTFLOAT_SIMD_DISABLE_WARNINGS + const uint16x8_t data = vld1q_u16(reinterpret_cast(chars)); + // (x - '0') <= 9 // http://0x80.pl/articles/simd-parsing-int-sequences.html const uint16x8_t t0 = vsubq_u16(data, vmovq_n_u16('0')); @@ -208,11 +183,12 @@ FASTFLOAT_SIMD_DISABLE_WARNINGS if (vminvq_u16(mask) == 0xFFFF) { i = i * 100000000 + parse_eight_digits_unrolled(simd_read8_to_u64(data)); return true; - } - else return false; -FASTFLOAT_SIMD_RESTORE_WARNINGS + } else + return false; + FASTFLOAT_SIMD_RESTORE_WARNINGS #else - (void)chars; (void)i; + (void)chars; + (void)i; return false; #endif // FASTFLOAT_SSE2 } @@ -223,55 +199,90 @@ FASTFLOAT_SIMD_RESTORE_WARNINGS #if defined(_MSC_VER) && _MSC_VER <= 1900 template #else -template ())> +template ()) = 0> #endif // dummy for compile -bool simd_parse_if_eight_digits_unrolled(UC const*, uint64_t&) { +bool simd_parse_if_eight_digits_unrolled(UC const *, uint64_t &) { return 0; } - -template ::value)> -fastfloat_really_inline FASTFLOAT_CONSTEXPR20 -void loop_parse_if_eight_digits(const UC*& p, const UC* const pend, uint64_t& i) { +template ::value) = 0> +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 void +loop_parse_if_eight_digits(const UC *&p, const UC *const pend, uint64_t &i) { if (!has_simd_opt()) { return; } - while ((std::distance(p, pend) >= 8) && simd_parse_if_eight_digits_unrolled(p, i)) { // in rare cases, this will overflow, but that's ok + while ((std::distance(p, pend) >= 8) && + simd_parse_if_eight_digits_unrolled( + p, i)) { // in rare cases, this will overflow, but that's ok p += 8; } } -fastfloat_really_inline FASTFLOAT_CONSTEXPR20 -void loop_parse_if_eight_digits(const char*& p, const char* const pend, uint64_t& i) { +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 void +loop_parse_if_eight_digits(const char *&p, const char *const pend, + uint64_t &i) { // optimizes better than parse_if_eight_digits_unrolled() for UC = char. - while ((std::distance(p, pend) >= 8) && is_made_of_eight_digits_fast(read8_to_u64(p))) { - i = i * 100000000 + parse_eight_digits_unrolled(read8_to_u64(p)); // in rare cases, this will overflow, but that's ok + while ((std::distance(p, pend) >= 8) && + is_made_of_eight_digits_fast(read8_to_u64(p))) { + i = i * 100000000 + + parse_eight_digits_unrolled(read8_to_u64( + p)); // in rare cases, this will overflow, but that's ok p += 8; } } -template -struct parsed_number_string_t { +enum class parse_error { + no_error, + // [JSON-only] The minus sign must be followed by an integer. + missing_integer_after_sign, + // A sign must be followed by an integer or dot. 
+ missing_integer_or_dot_after_sign, + // [JSON-only] The integer part must not have leading zeros. + leading_zeros_in_integer_part, + // [JSON-only] The integer part must have at least one digit. + no_digits_in_integer_part, + // [JSON-only] If there is a decimal point, there must be digits in the + // fractional part. + no_digits_in_fractional_part, + // The mantissa must have at least one digit. + no_digits_in_mantissa, + // Scientific notation requires an exponential part. + missing_exponential_part, +}; + +template struct parsed_number_string_t { int64_t exponent{0}; uint64_t mantissa{0}; - UC const * lastmatch{nullptr}; + UC const *lastmatch{nullptr}; bool negative{false}; bool valid{false}; bool too_many_digits{false}; // contains the range of the significant digits span integer{}; // non-nullable span fraction{}; // nullable + parse_error error{parse_error::no_error}; }; using byte_span = span; using parsed_number_string = parsed_number_string_t; +template +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 parsed_number_string_t +report_parse_error(UC const *p, parse_error error) { + parsed_number_string_t answer; + answer.valid = false; + answer.lastmatch = p; + answer.error = error; + return answer; +} + // Assuming that you use no more than 19 digits, this will // parse an ASCII string. template -fastfloat_really_inline FASTFLOAT_CONSTEXPR20 -parsed_number_string_t parse_number_string(UC const *p, UC const * pend, parse_options_t options) noexcept { +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 parsed_number_string_t +parse_number_string(UC const *p, UC const *pend, + parse_options_t options) noexcept { chars_format const fmt = options.format; UC const decimal_point = options.decimal_point; @@ -286,19 +297,24 @@ parsed_number_string_t parse_number_string(UC const *p, UC const * pend, par #endif ++p; if (p == pend) { - return answer; + return report_parse_error( + p, parse_error::missing_integer_or_dot_after_sign); } if (fmt & FASTFLOAT_JSONFMT) { if (!is_integer(*p)) { // a sign must be followed by an integer - return answer; - } + return report_parse_error(p, + parse_error::missing_integer_after_sign); + } } else { - if (!is_integer(*p) && (*p != decimal_point)) { // a sign must be followed by an integer or the dot - return answer; + if (!is_integer(*p) && + (*p != + decimal_point)) { // a sign must be followed by an integer or the dot + return report_parse_error( + p, parse_error::missing_integer_or_dot_after_sign); } } } - UC const * const start_digits = p; + UC const *const start_digits = p; uint64_t i = 0; // an unsigned int avoids signed overflows (which are bad) @@ -306,16 +322,21 @@ parsed_number_string_t parse_number_string(UC const *p, UC const * pend, par // a multiplication by 10 is cheaper than an arbitrary integer // multiplication i = 10 * i + - uint64_t(*p - UC('0')); // might overflow, we will handle the overflow later + uint64_t(*p - + UC('0')); // might overflow, we will handle the overflow later ++p; } - UC const * const end_of_integer_part = p; + UC const *const end_of_integer_part = p; int64_t digit_count = int64_t(end_of_integer_part - start_digits); answer.integer = span(start_digits, size_t(digit_count)); if (fmt & FASTFLOAT_JSONFMT) { // at least 1 digit in integer part, without leading zeros - if (digit_count == 0 || (start_digits[0] == UC('0') && digit_count > 1)) { - return answer; + if (digit_count == 0) { + return report_parse_error(p, parse_error::no_digits_in_integer_part); + } + if ((start_digits[0] == UC('0') && digit_count > 1)) { + return 
report_parse_error(start_digits, + parse_error::leading_zeros_in_integer_part); } } @@ -323,7 +344,7 @@ parsed_number_string_t parse_number_string(UC const *p, UC const * pend, par const bool has_decimal_point = (p != pend) && (*p == decimal_point); if (has_decimal_point) { ++p; - UC const * before = p; + UC const *before = p; // can occur at most twice without overflowing, but let it occur more, since // for integers with many digits, digit parsing is the primary bottleneck. loop_parse_if_eight_digits(p, pend, i); @@ -340,35 +361,39 @@ parsed_number_string_t parse_number_string(UC const *p, UC const * pend, par if (fmt & FASTFLOAT_JSONFMT) { // at least 1 digit in fractional part if (has_decimal_point && exponent == 0) { - return answer; + return report_parse_error(p, + parse_error::no_digits_in_fractional_part); } - } - else if (digit_count == 0) { // we must have encountered at least one integer! - return answer; + } else if (digit_count == + 0) { // we must have encountered at least one integer! + return report_parse_error(p, parse_error::no_digits_in_mantissa); } - int64_t exp_number = 0; // explicit exponential part - if ( ((fmt & chars_format::scientific) && - (p != pend) && - ((UC('e') == *p) || (UC('E') == *p))) - || - ((fmt & FASTFLOAT_FORTRANFMT) && - (p != pend) && - ((UC('+') == *p) || (UC('-') == *p) || (UC('d') == *p) || (UC('D') == *p)))) { - UC const * location_of_e = p; - if ((UC('e') == *p) || (UC('E') == *p) || (UC('d') == *p) || (UC('D') == *p)) { + int64_t exp_number = 0; // explicit exponential part + if (((fmt & chars_format::scientific) && (p != pend) && + ((UC('e') == *p) || (UC('E') == *p))) || + ((fmt & FASTFLOAT_FORTRANFMT) && (p != pend) && + ((UC('+') == *p) || (UC('-') == *p) || (UC('d') == *p) || + (UC('D') == *p)))) { + UC const *location_of_e = p; + if ((UC('e') == *p) || (UC('E') == *p) || (UC('d') == *p) || + (UC('D') == *p)) { ++p; } bool neg_exp = false; if ((p != pend) && (UC('-') == *p)) { neg_exp = true; ++p; - } else if ((p != pend) && (UC('+') == *p)) { // '+' on exponent is allowed by C++17 20.19.3.(7.1) + } else if ((p != pend) && + (UC('+') == + *p)) { // '+' on exponent is allowed by C++17 20.19.3.(7.1) ++p; } if ((p == pend) || !is_integer(*p)) { - if(!(fmt & chars_format::fixed)) { - // We are in error. - return answer; + if (!(fmt & chars_format::fixed)) { + // The exponential part is invalid for scientific notation, so it must + // be a trailing token for fixed notation. However, fixed notation is + // disabled, so report a scientific notation error. + return report_parse_error(p, parse_error::missing_exponential_part); } // Otherwise, we will be ignoring the 'e'. p = location_of_e; @@ -380,12 +405,16 @@ parsed_number_string_t parse_number_string(UC const *p, UC const * pend, par } ++p; } - if(neg_exp) { exp_number = - exp_number; } + if (neg_exp) { + exp_number = -exp_number; + } exponent += exp_number; } } else { // If it scientific and not fixed, we have to bail out. - if((fmt & chars_format::scientific) && !(fmt & chars_format::fixed)) { return answer; } + if ((fmt & chars_format::scientific) && !(fmt & chars_format::fixed)) { + return report_parse_error(p, parse_error::missing_exponential_part); + } } answer.lastmatch = p; answer.valid = true; @@ -400,9 +429,11 @@ parsed_number_string_t parse_number_string(UC const *p, UC const * pend, par // We have to handle the case where we have 0.0000somenumber. // We need to be mindful of the case where we only have zeroes... // E.g., 0.000000000...000. 
- UC const * start = start_digits; + UC const *start = start_digits; while ((start != pend) && (*start == UC('0') || *start == decimal_point)) { - if(*start == UC('0')) { digit_count --; } + if (*start == UC('0')) { + digit_count--; + } start++; } @@ -413,18 +444,17 @@ parsed_number_string_t parse_number_string(UC const *p, UC const * pend, par // pre-tokenized spans from above. i = 0; p = answer.integer.ptr; - UC const* int_end = p + answer.integer.len(); - const uint64_t minimal_nineteen_digit_integer{ 1000000000000000000 }; + UC const *int_end = p + answer.integer.len(); + const uint64_t minimal_nineteen_digit_integer{1000000000000000000}; while ((i < minimal_nineteen_digit_integer) && (p != int_end)) { i = i * 10 + uint64_t(*p - UC('0')); ++p; } if (i >= minimal_nineteen_digit_integer) { // We have a big integers exponent = end_of_integer_part - p + exp_number; - } - else { // We have a value with a fractional component. + } else { // We have a value with a fractional component. p = answer.fraction.ptr; - UC const* frac_end = p + answer.fraction.len(); + UC const *frac_end = p + answer.fraction.len(); while ((i < minimal_nineteen_digit_integer) && (p != frac_end)) { i = i * 10 + uint64_t(*p - UC('0')); ++p; @@ -439,6 +469,111 @@ parsed_number_string_t parse_number_string(UC const *p, UC const * pend, par return answer; } +template +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 from_chars_result_t +parse_int_string(UC const *p, UC const *pend, T &value, int base) { + from_chars_result_t answer; + + UC const *const first = p; + + bool negative = (*p == UC('-')); + if (!std::is_signed::value && negative) { + answer.ec = std::errc::invalid_argument; + answer.ptr = first; + return answer; + } +#ifdef FASTFLOAT_ALLOWS_LEADING_PLUS // disabled by default + if ((*p == UC('-')) || (*p == UC('+'))) { +#else + if (*p == UC('-')) { +#endif + ++p; + } + + UC const *const start_num = p; + + while (p != pend && *p == UC('0')) { + ++p; + } + + const bool has_leading_zeros = p > start_num; + + UC const *const start_digits = p; + + uint64_t i = 0; + if (base == 10) { + loop_parse_if_eight_digits(p, pend, i); // use SIMD if possible + } + while (p != pend) { + uint8_t digit = ch_to_digit(*p); + if (digit >= base) { + break; + } + i = uint64_t(base) * i + digit; // might overflow, check this later + p++; + } + + size_t digit_count = size_t(p - start_digits); + + if (digit_count == 0) { + if (has_leading_zeros) { + value = 0; + answer.ec = std::errc(); + answer.ptr = p; + } else { + answer.ec = std::errc::invalid_argument; + answer.ptr = first; + } + return answer; + } + + answer.ptr = p; + + // check u64 overflow + size_t max_digits = max_digits_u64(base); + if (digit_count > max_digits) { + answer.ec = std::errc::result_out_of_range; + return answer; + } + // this check can be eliminated for all other types, but they will all require + // a max_digits(base) equivalent + if (digit_count == max_digits && i < min_safe_u64(base)) { + answer.ec = std::errc::result_out_of_range; + return answer; + } + + // check other types overflow + if (!std::is_same::value) { + if (i > uint64_t(std::numeric_limits::max()) + uint64_t(negative)) { + answer.ec = std::errc::result_out_of_range; + return answer; + } + } + + if (negative) { +#ifdef FASTFLOAT_VISUAL_STUDIO +#pragma warning(push) +#pragma warning(disable : 4146) +#endif + // this weird workaround is required because: + // - converting unsigned to signed when its value is greater than signed max + // is UB pre-C++23. 
+ // - reinterpret_casting (~i + 1) would work, but it is not constexpr + // this is always optimized into a neg instruction (note: T is an integer + // type) + value = T(-std::numeric_limits::max() - + T(i - uint64_t(std::numeric_limits::max()))); +#ifdef FASTFLOAT_VISUAL_STUDIO +#pragma warning(pop) +#endif + } else { + value = T(i); + } + + answer.ec = std::errc(); + return answer; +} + } // namespace fast_float #endif diff --git a/third_party/fast_float/bigint.h b/third_party/fast_float/bigint.h index 5076b47cc5c9..03a5caa4a532 100644 --- a/third_party/fast_float/bigint.h +++ b/third_party/fast_float/bigint.h @@ -37,8 +37,7 @@ constexpr size_t bigint_limbs = bigint_bits / limb_bits; // vector-like type that is allocated on the stack. the entire // buffer is pre-allocated, and only the length changes. -template -struct stackvec { +template struct stackvec { limb data[size]; // we never need more than 150 limbs uint16_t length{0}; @@ -54,16 +53,16 @@ struct stackvec { FASTFLOAT_ASSERT(try_extend(s)); } - FASTFLOAT_CONSTEXPR14 limb& operator[](size_t index) noexcept { + FASTFLOAT_CONSTEXPR14 limb &operator[](size_t index) noexcept { FASTFLOAT_DEBUG_ASSERT(index < length); return data[index]; } - FASTFLOAT_CONSTEXPR14 const limb& operator[](size_t index) const noexcept { + FASTFLOAT_CONSTEXPR14 const limb &operator[](size_t index) const noexcept { FASTFLOAT_DEBUG_ASSERT(index < length); return data[index]; } // index from the end of the container - FASTFLOAT_CONSTEXPR14 const limb& rindex(size_t index) const noexcept { + FASTFLOAT_CONSTEXPR14 const limb &rindex(size_t index) const noexcept { FASTFLOAT_DEBUG_ASSERT(index < length); size_t rindex = length - index - 1; return data[rindex]; @@ -73,15 +72,9 @@ struct stackvec { FASTFLOAT_CONSTEXPR14 void set_len(size_t len) noexcept { length = uint16_t(len); } - constexpr size_t len() const noexcept { - return length; - } - constexpr bool is_empty() const noexcept { - return length == 0; - } - constexpr size_t capacity() const noexcept { - return size; - } + constexpr size_t len() const noexcept { return length; } + constexpr bool is_empty() const noexcept { return length == 0; } + constexpr size_t capacity() const noexcept { return size; } // append item to vector, without bounds checking FASTFLOAT_CONSTEXPR14 void push_unchecked(limb value) noexcept { data[length] = value; @@ -98,7 +91,7 @@ struct stackvec { } // add items to the vector, from a span, without bounds checking FASTFLOAT_CONSTEXPR20 void extend_unchecked(limb_span s) noexcept { - limb* ptr = data + length; + limb *ptr = data + length; std::copy_n(s.ptr, s.len(), ptr); set_len(len() + s.len()); } @@ -118,8 +111,8 @@ struct stackvec { void resize_unchecked(size_t new_len, limb value) noexcept { if (new_len > len()) { size_t count = new_len - len(); - limb* first = data + len(); - limb* last = first + count; + limb *first = data + len(); + limb *last = first + count; ::std::fill(first, last, value); set_len(new_len); } else { @@ -155,21 +148,21 @@ struct stackvec { } }; -fastfloat_really_inline FASTFLOAT_CONSTEXPR14 -uint64_t empty_hi64(bool& truncated) noexcept { +fastfloat_really_inline FASTFLOAT_CONSTEXPR14 uint64_t +empty_hi64(bool &truncated) noexcept { truncated = false; return 0; } -fastfloat_really_inline FASTFLOAT_CONSTEXPR20 -uint64_t uint64_hi64(uint64_t r0, bool& truncated) noexcept { +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 uint64_t +uint64_hi64(uint64_t r0, bool &truncated) noexcept { truncated = false; int shl = leading_zeroes(r0); return r0 << shl; } 
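For readers following this hunk, a minimal sketch of how the two-limb hi64 helper just below behaves; this is an illustration only, assuming the fast_float declarations in this header (leading_zeroes, uint64_hi64) are available, with made-up input values:

#include <cstdint>
// Left-normalize the most significant limb and pull in the high bits of the
// next limb; `truncated` records whether any non-zero low bits were dropped.
// (Hypothetical example, not part of the patch.)
void hi64_example() {
  bool truncated = false;
  // r0 = 1 has 63 leading zeroes, so the result is (r0 << 63) | (r1 >> 1).
  uint64_t hi = fast_float::uint64_hi64(/*r0=*/1, /*r1=*/3, truncated);
  // hi == 0x8000000000000001; truncated == true because bit 0 of r1 was
  // shifted out and lost.
  (void)hi;
}
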
-fastfloat_really_inline FASTFLOAT_CONSTEXPR20 -uint64_t uint64_hi64(uint64_t r0, uint64_t r1, bool& truncated) noexcept { +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 uint64_t +uint64_hi64(uint64_t r0, uint64_t r1, bool &truncated) noexcept { int shl = leading_zeroes(r0); if (shl == 0) { truncated = r1 != 0; @@ -181,20 +174,20 @@ uint64_t uint64_hi64(uint64_t r0, uint64_t r1, bool& truncated) noexcept { } } -fastfloat_really_inline FASTFLOAT_CONSTEXPR20 -uint64_t uint32_hi64(uint32_t r0, bool& truncated) noexcept { +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 uint64_t +uint32_hi64(uint32_t r0, bool &truncated) noexcept { return uint64_hi64(r0, truncated); } -fastfloat_really_inline FASTFLOAT_CONSTEXPR20 -uint64_t uint32_hi64(uint32_t r0, uint32_t r1, bool& truncated) noexcept { +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 uint64_t +uint32_hi64(uint32_t r0, uint32_t r1, bool &truncated) noexcept { uint64_t x0 = r0; uint64_t x1 = r1; return uint64_hi64((x0 << 32) | x1, truncated); } -fastfloat_really_inline FASTFLOAT_CONSTEXPR20 -uint64_t uint32_hi64(uint32_t r0, uint32_t r1, uint32_t r2, bool& truncated) noexcept { +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 uint64_t +uint32_hi64(uint32_t r0, uint32_t r1, uint32_t r2, bool &truncated) noexcept { uint64_t x0 = r0; uint64_t x1 = r1; uint64_t x2 = r2; @@ -205,17 +198,17 @@ uint64_t uint32_hi64(uint32_t r0, uint32_t r1, uint32_t r2, bool& truncated) noe // we want an efficient operation. for msvc, where // we don't have built-in intrinsics, this is still // pretty fast. -fastfloat_really_inline FASTFLOAT_CONSTEXPR20 -limb scalar_add(limb x, limb y, bool& overflow) noexcept { +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 limb +scalar_add(limb x, limb y, bool &overflow) noexcept { limb z; // gcc and clang #if defined(__has_builtin) - #if __has_builtin(__builtin_add_overflow) - if (!cpp20_and_in_constexpr()) { - overflow = __builtin_add_overflow(x, y, &z); - return z; - } - #endif +#if __has_builtin(__builtin_add_overflow) + if (!cpp20_and_in_constexpr()) { + overflow = __builtin_add_overflow(x, y, &z); + return z; + } +#endif #endif // generic, this still optimizes correctly on MSVC. @@ -225,24 +218,24 @@ limb scalar_add(limb x, limb y, bool& overflow) noexcept { } // multiply two small integers, getting both the high and low bits. -fastfloat_really_inline FASTFLOAT_CONSTEXPR20 -limb scalar_mul(limb x, limb y, limb& carry) noexcept { +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 limb +scalar_mul(limb x, limb y, limb &carry) noexcept { #ifdef FASTFLOAT_64BIT_LIMB - #if defined(__SIZEOF_INT128__) +#if defined(__SIZEOF_INT128__) // GCC and clang both define it as an extension. __uint128_t z = __uint128_t(x) * __uint128_t(y) + __uint128_t(carry); carry = limb(z >> limb_bits); return limb(z); - #else +#else // fallback, no native 128-bit integer multiplication with carry. // on msvc, this optimizes identically, somehow. value128 z = full_multiplication(x, y); bool overflow; z.low = scalar_add(z.low, carry, overflow); - z.high += uint64_t(overflow); // cannot overflow + z.high += uint64_t(overflow); // cannot overflow carry = z.high; return z.low; - #endif +#endif #else uint64_t z = uint64_t(x) * uint64_t(y) + uint64_t(carry); carry = limb(z >> limb_bits); @@ -253,8 +246,8 @@ limb scalar_mul(limb x, limb y, limb& carry) noexcept { // add scalar value to bigint starting from offset. 
// used in grade school multiplication template -inline FASTFLOAT_CONSTEXPR20 -bool small_add_from(stackvec& vec, limb y, size_t start) noexcept { +inline FASTFLOAT_CONSTEXPR20 bool small_add_from(stackvec &vec, limb y, + size_t start) noexcept { size_t index = start; limb carry = y; bool overflow; @@ -271,15 +264,15 @@ bool small_add_from(stackvec& vec, limb y, size_t start) noexcept { // add scalar value to bigint. template -fastfloat_really_inline FASTFLOAT_CONSTEXPR20 -bool small_add(stackvec& vec, limb y) noexcept { +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 bool +small_add(stackvec &vec, limb y) noexcept { return small_add_from(vec, y, 0); } // multiply bigint by scalar value. template -inline FASTFLOAT_CONSTEXPR20 -bool small_mul(stackvec& vec, limb y) noexcept { +inline FASTFLOAT_CONSTEXPR20 bool small_mul(stackvec &vec, + limb y) noexcept { limb carry = 0; for (size_t index = 0; index < vec.len(); index++) { vec[index] = scalar_mul(vec[index], y, carry); @@ -293,12 +286,12 @@ bool small_mul(stackvec& vec, limb y) noexcept { // add bigint to bigint starting from index. // used in grade school multiplication template -FASTFLOAT_CONSTEXPR20 -bool large_add_from(stackvec& x, limb_span y, size_t start) noexcept { +FASTFLOAT_CONSTEXPR20 bool large_add_from(stackvec &x, limb_span y, + size_t start) noexcept { // the effective x buffer is from `xstart..x.len()`, so exit early // if we can't get that current range. if (x.len() < start || y.len() > x.len() - start) { - FASTFLOAT_TRY(x.try_resize(y.len() + start, 0)); + FASTFLOAT_TRY(x.try_resize(y.len() + start, 0)); } bool carry = false; @@ -324,15 +317,14 @@ bool large_add_from(stackvec& x, limb_span y, size_t start) noexcept { // add bigint to bigint. template -fastfloat_really_inline FASTFLOAT_CONSTEXPR20 -bool large_add_from(stackvec& x, limb_span y) noexcept { +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 bool +large_add_from(stackvec &x, limb_span y) noexcept { return large_add_from(x, y, 0); } // grade-school multiplication algorithm template -FASTFLOAT_CONSTEXPR20 -bool long_mul(stackvec& x, limb_span y) noexcept { +FASTFLOAT_CONSTEXPR20 bool long_mul(stackvec &x, limb_span y) noexcept { limb_span xs = limb_span(x.data, x.len()); stackvec z(xs); limb_span zs = limb_span(z.data, z.len()); @@ -360,8 +352,7 @@ bool long_mul(stackvec& x, limb_span y) noexcept { // grade-school multiplication algorithm template -FASTFLOAT_CONSTEXPR20 -bool large_mul(stackvec& x, limb_span y) noexcept { +FASTFLOAT_CONSTEXPR20 bool large_mul(stackvec &x, limb_span y) noexcept { if (y.len() == 1) { FASTFLOAT_TRY(small_mul(x, y[0])); } else { @@ -370,36 +361,58 @@ bool large_mul(stackvec& x, limb_span y) noexcept { return true; } -template -struct pow5_tables { +template struct pow5_tables { static constexpr uint32_t large_step = 135; static constexpr uint64_t small_power_of_5[] = { - 1UL, 5UL, 25UL, 125UL, 625UL, 3125UL, 15625UL, 78125UL, 390625UL, - 1953125UL, 9765625UL, 48828125UL, 244140625UL, 1220703125UL, - 6103515625UL, 30517578125UL, 152587890625UL, 762939453125UL, - 3814697265625UL, 19073486328125UL, 95367431640625UL, 476837158203125UL, - 2384185791015625UL, 11920928955078125UL, 59604644775390625UL, - 298023223876953125UL, 1490116119384765625UL, 7450580596923828125UL, + 1UL, + 5UL, + 25UL, + 125UL, + 625UL, + 3125UL, + 15625UL, + 78125UL, + 390625UL, + 1953125UL, + 9765625UL, + 48828125UL, + 244140625UL, + 1220703125UL, + 6103515625UL, + 30517578125UL, + 152587890625UL, + 762939453125UL, + 3814697265625UL, + 19073486328125UL, + 
95367431640625UL, + 476837158203125UL, + 2384185791015625UL, + 11920928955078125UL, + 59604644775390625UL, + 298023223876953125UL, + 1490116119384765625UL, + 7450580596923828125UL, }; #ifdef FASTFLOAT_64BIT_LIMB constexpr static limb large_power_of_5[] = { - 1414648277510068013UL, 9180637584431281687UL, 4539964771860779200UL, - 10482974169319127550UL, 198276706040285095UL}; + 1414648277510068013UL, 9180637584431281687UL, 4539964771860779200UL, + 10482974169319127550UL, 198276706040285095UL}; #else constexpr static limb large_power_of_5[] = { - 4279965485U, 329373468U, 4020270615U, 2137533757U, 4287402176U, - 1057042919U, 1071430142U, 2440757623U, 381945767U, 46164893U}; + 4279965485U, 329373468U, 4020270615U, 2137533757U, 4287402176U, + 1057042919U, 1071430142U, 2440757623U, 381945767U, 46164893U}; #endif }; -template -constexpr uint32_t pow5_tables::large_step; +#if FASTFLOAT_DETAIL_MUST_DEFINE_CONSTEXPR_VARIABLE + +template constexpr uint32_t pow5_tables::large_step; -template -constexpr uint64_t pow5_tables::small_power_of_5[]; +template constexpr uint64_t pow5_tables::small_power_of_5[]; -template -constexpr limb pow5_tables::large_power_of_5[]; +template constexpr limb pow5_tables::large_power_of_5[]; + +#endif // big integer type. implements a small subset of big integer // arithmetic, using simple algorithms since asymptotically @@ -409,13 +422,13 @@ struct bigint : pow5_tables<> { // storage of the limbs, in little-endian order. stackvec vec; - FASTFLOAT_CONSTEXPR20 bigint(): vec() {} + FASTFLOAT_CONSTEXPR20 bigint() : vec() {} bigint(const bigint &) = delete; bigint &operator=(const bigint &) = delete; bigint(bigint &&) = delete; bigint &operator=(bigint &&other) = delete; - FASTFLOAT_CONSTEXPR20 bigint(uint64_t value): vec() { + FASTFLOAT_CONSTEXPR20 bigint(uint64_t value) : vec() { #ifdef FASTFLOAT_64BIT_LIMB vec.push_unchecked(value); #else @@ -427,7 +440,7 @@ struct bigint : pow5_tables<> { // get the high 64 bits from the vector, and if bits were truncated. // this is to get the significant digits for the float. - FASTFLOAT_CONSTEXPR20 uint64_t hi64(bool& truncated) const noexcept { + FASTFLOAT_CONSTEXPR20 uint64_t hi64(bool &truncated) const noexcept { #ifdef FASTFLOAT_64BIT_LIMB if (vec.len() == 0) { return empty_hi64(truncated); @@ -446,7 +459,8 @@ struct bigint : pow5_tables<> { } else if (vec.len() == 2) { return uint32_hi64(vec.rindex(0), vec.rindex(1), truncated); } else { - uint64_t result = uint32_hi64(vec.rindex(0), vec.rindex(1), vec.rindex(2), truncated); + uint64_t result = + uint32_hi64(vec.rindex(0), vec.rindex(1), vec.rindex(2), truncated); truncated |= vec.nonzero(3); return result; } @@ -459,7 +473,7 @@ struct bigint : pow5_tables<> { // positive, this is larger, otherwise they are equal. // the limbs are stored in little-endian order, so we // must compare the limbs in ever order. 
- FASTFLOAT_CONSTEXPR20 int compare(const bigint& other) const noexcept { + FASTFLOAT_CONSTEXPR20 int compare(const bigint &other) const noexcept { if (vec.len() > other.vec.len()) { return 1; } else if (vec.len() < other.vec.len()) { @@ -512,12 +526,12 @@ struct bigint : pow5_tables<> { return false; } else if (!vec.is_empty()) { // move limbs - limb* dst = vec.data + n; - const limb* src = vec.data; + limb *dst = vec.data + n; + const limb *src = vec.data; std::copy_backward(src, src + vec.len(), dst + vec.len()); // fill in empty limbs - limb* first = vec.data; - limb* last = first + n; + limb *first = vec.data; + limb *last = first + n; ::std::fill(first, last, 0); vec.set_len(n + vec.len()); return true; @@ -560,18 +574,12 @@ struct bigint : pow5_tables<> { return int(limb_bits * vec.len()) - lz; } - FASTFLOAT_CONSTEXPR20 bool mul(limb y) noexcept { - return small_mul(vec, y); - } + FASTFLOAT_CONSTEXPR20 bool mul(limb y) noexcept { return small_mul(vec, y); } - FASTFLOAT_CONSTEXPR20 bool add(limb y) noexcept { - return small_add(vec, y); - } + FASTFLOAT_CONSTEXPR20 bool add(limb y) noexcept { return small_add(vec, y); } // multiply as if by 2 raised to a power. - FASTFLOAT_CONSTEXPR20 bool pow2(uint32_t exp) noexcept { - return shl(exp); - } + FASTFLOAT_CONSTEXPR20 bool pow2(uint32_t exp) noexcept { return shl(exp); } // multiply as if by 5 raised to a power. FASTFLOAT_CONSTEXPR20 bool pow5(uint32_t exp) noexcept { @@ -597,9 +605,8 @@ struct bigint : pow5_tables<> { // Work around clang bug https://godbolt.org/z/zedh7rrhc // This is similar to https://github.com/llvm/llvm-project/issues/47746, // except the workaround described there don't work here - FASTFLOAT_TRY( - small_mul(vec, limb(((void)small_power_of_5[0], small_power_of_5[exp]))) - ); + FASTFLOAT_TRY(small_mul( + vec, limb(((void)small_power_of_5[0], small_power_of_5[exp])))); } return true; diff --git a/third_party/fast_float/constexpr_feature_detect.h b/third_party/fast_float/constexpr_feature_detect.h index ba8b65c64a16..7624beafcacf 100644 --- a/third_party/fast_float/constexpr_feature_detect.h +++ b/third_party/fast_float/constexpr_feature_detect.h @@ -20,16 +20,16 @@ #define FASTFLOAT_HAS_BIT_CAST 0 #endif -#if defined(__cpp_lib_is_constant_evaluated) && __cpp_lib_is_constant_evaluated >= 201811L +#if defined(__cpp_lib_is_constant_evaluated) && \ + __cpp_lib_is_constant_evaluated >= 201811L #define FASTFLOAT_HAS_IS_CONSTANT_EVALUATED 1 #else #define FASTFLOAT_HAS_IS_CONSTANT_EVALUATED 0 #endif // Testing for relevant C++20 constexpr library features -#if FASTFLOAT_HAS_IS_CONSTANT_EVALUATED \ - && FASTFLOAT_HAS_BIT_CAST \ - && __cpp_lib_constexpr_algorithms >= 201806L /*For std::copy and std::fill*/ +#if FASTFLOAT_HAS_IS_CONSTANT_EVALUATED && FASTFLOAT_HAS_BIT_CAST && \ + __cpp_lib_constexpr_algorithms >= 201806L /*For std::copy and std::fill*/ #define FASTFLOAT_CONSTEXPR20 constexpr #define FASTFLOAT_IS_CONSTEXPR 1 #else @@ -37,4 +37,10 @@ #define FASTFLOAT_IS_CONSTEXPR 0 #endif +#if __cplusplus >= 201703L || (defined(_MSVC_LANG) && _MSVC_LANG >= 201703L) +#define FASTFLOAT_DETAIL_MUST_DEFINE_CONSTEXPR_VARIABLE 0 +#else +#define FASTFLOAT_DETAIL_MUST_DEFINE_CONSTEXPR_VARIABLE 1 +#endif + #endif // FASTFLOAT_CONSTEXPR_FEATURE_DETECT_H diff --git a/third_party/fast_float/decimal_to_binary.h b/third_party/fast_float/decimal_to_binary.h index fec916f3a07b..70ecf73c8ea6 100644 --- a/third_party/fast_float/decimal_to_binary.h +++ b/third_party/fast_float/decimal_to_binary.h @@ -12,27 +12,34 @@ namespace fast_float { -// This 
will compute or rather approximate w * 5**q and return a pair of 64-bit words approximating -// the result, with the "high" part corresponding to the most significant bits and the -// low part corresponding to the least significant bits. +// This will compute or rather approximate w * 5**q and return a pair of 64-bit +// words approximating the result, with the "high" part corresponding to the +// most significant bits and the low part corresponding to the least significant +// bits. // template -fastfloat_really_inline FASTFLOAT_CONSTEXPR20 -value128 compute_product_approximation(int64_t q, uint64_t w) { +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 value128 +compute_product_approximation(int64_t q, uint64_t w) { const int index = 2 * int(q - powers::smallest_power_of_five); - // For small values of q, e.g., q in [0,27], the answer is always exact because - // The line value128 firstproduct = full_multiplication(w, power_of_five_128[index]); - // gives the exact answer. - value128 firstproduct = full_multiplication(w, powers::power_of_five_128[index]); - static_assert((bit_precision >= 0) && (bit_precision <= 64), " precision should be in (0,64]"); - constexpr uint64_t precision_mask = (bit_precision < 64) ? - (uint64_t(0xFFFFFFFFFFFFFFFF) >> bit_precision) - : uint64_t(0xFFFFFFFFFFFFFFFF); - if((firstproduct.high & precision_mask) == precision_mask) { // could further guard with (lower + w < lower) - // regarding the second product, we only need secondproduct.high, but our expectation is that the compiler will optimize this extra work away if needed. - value128 secondproduct = full_multiplication(w, powers::power_of_five_128[index + 1]); + // For small values of q, e.g., q in [0,27], the answer is always exact + // because The line value128 firstproduct = full_multiplication(w, + // power_of_five_128[index]); gives the exact answer. + value128 firstproduct = + full_multiplication(w, powers::power_of_five_128[index]); + static_assert((bit_precision >= 0) && (bit_precision <= 64), + " precision should be in (0,64]"); + constexpr uint64_t precision_mask = + (bit_precision < 64) ? (uint64_t(0xFFFFFFFFFFFFFFFF) >> bit_precision) + : uint64_t(0xFFFFFFFFFFFFFFFF); + if ((firstproduct.high & precision_mask) == + precision_mask) { // could further guard with (lower + w < lower) + // regarding the second product, we only need secondproduct.high, but our + // expectation is that the compiler will optimize this extra work away if + // needed. + value128 secondproduct = + full_multiplication(w, powers::power_of_five_128[index + 1]); firstproduct.low += secondproduct.high; - if(secondproduct.high > firstproduct.low) { + if (secondproduct.high > firstproduct.low) { firstproduct.high++; } } @@ -55,43 +62,45 @@ namespace detail { * where * p = log(5**-q)/log(2) = -q * log(5)/log(2) */ - constexpr fastfloat_really_inline int32_t power(int32_t q) noexcept { - return (((152170 + 65536) * q) >> 16) + 63; - } +constexpr fastfloat_really_inline int32_t power(int32_t q) noexcept { + return (((152170 + 65536) * q) >> 16) + 63; +} } // namespace detail // create an adjusted mantissa, biased by the invalid power2 // for significant digits already multiplied by 10 ** q. 
template -fastfloat_really_inline FASTFLOAT_CONSTEXPR14 -adjusted_mantissa compute_error_scaled(int64_t q, uint64_t w, int lz) noexcept { +fastfloat_really_inline FASTFLOAT_CONSTEXPR14 adjusted_mantissa +compute_error_scaled(int64_t q, uint64_t w, int lz) noexcept { int hilz = int(w >> 63) ^ 1; adjusted_mantissa answer; answer.mantissa = w << hilz; int bias = binary::mantissa_explicit_bits() - binary::minimum_exponent(); - answer.power2 = int32_t(detail::power(int32_t(q)) + bias - hilz - lz - 62 + invalid_am_bias); + answer.power2 = int32_t(detail::power(int32_t(q)) + bias - hilz - lz - 62 + + invalid_am_bias); return answer; } // w * 10 ** q, without rounding the representation up. // the power2 in the exponent will be adjusted by invalid_am_bias. template -fastfloat_really_inline FASTFLOAT_CONSTEXPR20 -adjusted_mantissa compute_error(int64_t q, uint64_t w) noexcept { +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 adjusted_mantissa +compute_error(int64_t q, uint64_t w) noexcept { int lz = leading_zeroes(w); w <<= lz; - value128 product = compute_product_approximation(q, w); + value128 product = + compute_product_approximation(q, w); return compute_error_scaled(q, product.high, lz); } // w * 10 ** q -// The returned value should be a valid ieee64 number that simply need to be packed. -// However, in some very rare cases, the computation will fail. In such cases, we -// return an adjusted_mantissa with a negative power of 2: the caller should recompute -// in such cases. +// The returned value should be a valid ieee64 number that simply need to be +// packed. However, in some very rare cases, the computation will fail. In such +// cases, we return an adjusted_mantissa with a negative power of 2: the caller +// should recompute in such cases. template -fastfloat_really_inline FASTFLOAT_CONSTEXPR20 -adjusted_mantissa compute_float(int64_t q, uint64_t w) noexcept { +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 adjusted_mantissa +compute_float(int64_t q, uint64_t w) noexcept { adjusted_mantissa answer; if ((w == 0) || (q < binary::smallest_power_of_ten())) { answer.power2 = 0; @@ -105,7 +114,8 @@ adjusted_mantissa compute_float(int64_t q, uint64_t w) noexcept { answer.mantissa = 0; return answer; } - // At this point in time q is in [powers::smallest_power_of_five, powers::largest_power_of_five]. + // At this point in time q is in [powers::smallest_power_of_five, + // powers::largest_power_of_five]. // We want the most significant bit of i to be 1. Shift if needed. int lz = leading_zeroes(w); @@ -114,26 +124,32 @@ adjusted_mantissa compute_float(int64_t q, uint64_t w) noexcept { // The required precision is binary::mantissa_explicit_bits() + 3 because // 1. We need the implicit bit // 2. We need an extra bit for rounding purposes - // 3. We might lose a bit due to the "upperbit" routine (result too small, requiring a shift) + // 3. We might lose a bit due to the "upperbit" routine (result too small, + // requiring a shift) - value128 product = compute_product_approximation(q, w); + value128 product = + compute_product_approximation(q, w); // The computed 'product' is always sufficient. 
// Mathematical proof: - // Noble Mushtak and Daniel Lemire, Fast Number Parsing Without Fallback (to appear) - // See script/mushtak_lemire.py + // Noble Mushtak and Daniel Lemire, Fast Number Parsing Without Fallback (to + // appear) See script/mushtak_lemire.py - // The "compute_product_approximation" function can be slightly slower than a branchless approach: - // value128 product = compute_product(q, w); - // but in practice, we can win big with the compute_product_approximation if its additional branch - // is easily predicted. Which is best is data specific. + // The "compute_product_approximation" function can be slightly slower than a + // branchless approach: value128 product = compute_product(q, w); but in + // practice, we can win big with the compute_product_approximation if its + // additional branch is easily predicted. Which is best is data specific. int upperbit = int(product.high >> 63); + int shift = upperbit + 64 - binary::mantissa_explicit_bits() - 3; - answer.mantissa = product.high >> (upperbit + 64 - binary::mantissa_explicit_bits() - 3); + answer.mantissa = product.high >> shift; - answer.power2 = int32_t(detail::power(int32_t(q)) + upperbit - lz - binary::minimum_exponent()); + answer.power2 = int32_t(detail::power(int32_t(q)) + upperbit - lz - + binary::minimum_exponent()); if (answer.power2 <= 0) { // we have a subnormal? // Here have that answer.power2 <= 0 so -answer.power2 >= 0 - if(-answer.power2 + 1 >= 64) { // if we have more than 64 bits below the minimum exponent, you have a zero for sure. + if (-answer.power2 + 1 >= + 64) { // if we have more than 64 bits below the minimum exponent, you + // have a zero for sure. answer.power2 = 0; answer.mantissa = 0; // result should be zero @@ -152,20 +168,26 @@ adjusted_mantissa compute_float(int64_t q, uint64_t w) noexcept { // up 0x3fffffffffffff x 2^-1023-53 and once we do, we are no longer // subnormal, but we can only know this after rounding. // So we only declare a subnormal if we are smaller than the threshold. - answer.power2 = (answer.mantissa < (uint64_t(1) << binary::mantissa_explicit_bits())) ? 0 : 1; + answer.power2 = + (answer.mantissa < (uint64_t(1) << binary::mantissa_explicit_bits())) + ? 0 + : 1; return answer; } // usually, we round *up*, but if we fall right in between and and we have an // even basis, we need to round down // We are only concerned with the cases where 5**q fits in single 64-bit word. - if ((product.low <= 1) && (q >= binary::min_exponent_round_to_even()) && (q <= binary::max_exponent_round_to_even()) && - ((answer.mantissa & 3) == 1) ) { // we may fall between two floats! + if ((product.low <= 1) && (q >= binary::min_exponent_round_to_even()) && + (q <= binary::max_exponent_round_to_even()) && + ((answer.mantissa & 3) == 1)) { // we may fall between two floats! // To be in-between two floats we need that in doing - // answer.mantissa = product.high >> (upperbit + 64 - binary::mantissa_explicit_bits() - 3); - // ... we dropped out only zeroes. But if this happened, then we can go back!!! - if((answer.mantissa << (upperbit + 64 - binary::mantissa_explicit_bits() - 3)) == product.high) { - answer.mantissa &= ~uint64_t(1); // flip it so that we do not round up + // answer.mantissa = product.high >> (upperbit + 64 - + // binary::mantissa_explicit_bits() - 3); + // ... we dropped out only zeroes. But if this happened, then we can go + // back!!! 
+ if ((answer.mantissa << shift) == product.high) { + answer.mantissa &= ~uint64_t(1); // flip it so that we do not round up } } diff --git a/third_party/fast_float/digit_comparison.h b/third_party/fast_float/digit_comparison.h index 512a27f5a5f4..303fff91eb1f 100644 --- a/third_party/fast_float/digit_comparison.h +++ b/third_party/fast_float/digit_comparison.h @@ -13,19 +13,34 @@ namespace fast_float { // 1e0 to 1e19 -constexpr static uint64_t powers_of_ten_uint64[] = { - 1UL, 10UL, 100UL, 1000UL, 10000UL, 100000UL, 1000000UL, 10000000UL, 100000000UL, - 1000000000UL, 10000000000UL, 100000000000UL, 1000000000000UL, 10000000000000UL, - 100000000000000UL, 1000000000000000UL, 10000000000000000UL, 100000000000000000UL, - 1000000000000000000UL, 10000000000000000000UL}; +constexpr static uint64_t powers_of_ten_uint64[] = {1UL, + 10UL, + 100UL, + 1000UL, + 10000UL, + 100000UL, + 1000000UL, + 10000000UL, + 100000000UL, + 1000000000UL, + 10000000000UL, + 100000000000UL, + 1000000000000UL, + 10000000000000UL, + 100000000000000UL, + 1000000000000000UL, + 10000000000000000UL, + 100000000000000000UL, + 1000000000000000000UL, + 10000000000000000000UL}; // calculate the exponent, in scientific notation, of the number. // this algorithm is not even close to optimized, but it has no practical // effect on performance: in order to have a faster algorithm, we'd need // to slow down performance for faster algorithms, and this is still fast. template -fastfloat_really_inline FASTFLOAT_CONSTEXPR14 -int32_t scientific_exponent(parsed_number_string_t & num) noexcept { +fastfloat_really_inline FASTFLOAT_CONSTEXPR14 int32_t +scientific_exponent(parsed_number_string_t &num) noexcept { uint64_t mantissa = num.mantissa; int32_t exponent = int32_t(num.exponent); while (mantissa >= 10000) { @@ -45,15 +60,16 @@ int32_t scientific_exponent(parsed_number_string_t & num) noexcept { // this converts a native floating-point number to an extended-precision float. template -fastfloat_really_inline FASTFLOAT_CONSTEXPR20 -adjusted_mantissa to_extended(T value) noexcept { +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 adjusted_mantissa +to_extended(T value) noexcept { using equiv_uint = typename binary_format::equiv_uint; constexpr equiv_uint exponent_mask = binary_format::exponent_mask(); constexpr equiv_uint mantissa_mask = binary_format::mantissa_mask(); constexpr equiv_uint hidden_bit_mask = binary_format::hidden_bit_mask(); adjusted_mantissa am; - int32_t bias = binary_format::mantissa_explicit_bits() - binary_format::minimum_exponent(); + int32_t bias = binary_format::mantissa_explicit_bits() - + binary_format::minimum_exponent(); equiv_uint bits; #if FASTFLOAT_HAS_BIT_CAST bits = std::bit_cast(value); @@ -66,7 +82,8 @@ adjusted_mantissa to_extended(T value) noexcept { am.mantissa = bits & mantissa_mask; } else { // normal - am.power2 = int32_t((bits & exponent_mask) >> binary_format::mantissa_explicit_bits()); + am.power2 = int32_t((bits & exponent_mask) >> + binary_format::mantissa_explicit_bits()); am.power2 -= bias; am.mantissa = (bits & mantissa_mask) | hidden_bit_mask; } @@ -78,8 +95,8 @@ adjusted_mantissa to_extended(T value) noexcept { // we are given a native float that represents b, so we need to adjust it // halfway between b and b+u. 
template -fastfloat_really_inline FASTFLOAT_CONSTEXPR20 -adjusted_mantissa to_extended_halfway(T value) noexcept { +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 adjusted_mantissa +to_extended_halfway(T value) noexcept { adjusted_mantissa am = to_extended(value); am.mantissa <<= 1; am.mantissa += 1; @@ -89,15 +106,18 @@ adjusted_mantissa to_extended_halfway(T value) noexcept { // round an extended-precision float to the nearest machine float. template -fastfloat_really_inline FASTFLOAT_CONSTEXPR14 -void round(adjusted_mantissa& am, callback cb) noexcept { +fastfloat_really_inline FASTFLOAT_CONSTEXPR14 void round(adjusted_mantissa &am, + callback cb) noexcept { int32_t mantissa_shift = 64 - binary_format::mantissa_explicit_bits() - 1; if (-am.power2 >= mantissa_shift) { // have a denormal float int32_t shift = -am.power2 + 1; cb(am, std::min(shift, 64)); // check for round-up: if rounding-nearest carried us to the hidden bit. - am.power2 = (am.mantissa < (uint64_t(1) << binary_format::mantissa_explicit_bits())) ? 0 : 1; + am.power2 = (am.mantissa < + (uint64_t(1) << binary_format::mantissa_explicit_bits())) + ? 0 + : 1; return; } @@ -105,7 +125,8 @@ void round(adjusted_mantissa& am, callback cb) noexcept { cb(am, mantissa_shift); // check for carry - if (am.mantissa >= (uint64_t(2) << binary_format::mantissa_explicit_bits())) { + if (am.mantissa >= + (uint64_t(2) << binary_format::mantissa_explicit_bits())) { am.mantissa = (uint64_t(1) << binary_format::mantissa_explicit_bits()); am.power2++; } @@ -119,16 +140,11 @@ void round(adjusted_mantissa& am, callback cb) noexcept { } template -fastfloat_really_inline FASTFLOAT_CONSTEXPR14 -void round_nearest_tie_even(adjusted_mantissa& am, int32_t shift, callback cb) noexcept { - const uint64_t mask - = (shift == 64) - ? UINT64_MAX - : (uint64_t(1) << shift) - 1; - const uint64_t halfway - = (shift == 0) - ? 0 - : uint64_t(1) << (shift - 1); +fastfloat_really_inline FASTFLOAT_CONSTEXPR14 void +round_nearest_tie_even(adjusted_mantissa &am, int32_t shift, + callback cb) noexcept { + const uint64_t mask = (shift == 64) ? UINT64_MAX : (uint64_t(1) << shift) - 1; + const uint64_t halfway = (shift == 0) ? 0 : uint64_t(1) << (shift - 1); uint64_t truncated_bits = am.mantissa & mask; bool is_above = truncated_bits > halfway; bool is_halfway = truncated_bits == halfway; @@ -145,8 +161,8 @@ void round_nearest_tie_even(adjusted_mantissa& am, int32_t shift, callback cb) n am.mantissa += uint64_t(cb(is_odd, is_halfway, is_above)); } -fastfloat_really_inline FASTFLOAT_CONSTEXPR14 -void round_down(adjusted_mantissa& am, int32_t shift) noexcept { +fastfloat_really_inline FASTFLOAT_CONSTEXPR14 void +round_down(adjusted_mantissa &am, int32_t shift) noexcept { if (shift == 64) { am.mantissa = 0; } else { @@ -155,10 +171,11 @@ void round_down(adjusted_mantissa& am, int32_t shift) noexcept { am.power2 += shift; } template -fastfloat_really_inline FASTFLOAT_CONSTEXPR20 -void skip_zeros(UC const * & first, UC const * last) noexcept { +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 void +skip_zeros(UC const *&first, UC const *last) noexcept { uint64_t val; - while (!cpp20_and_in_constexpr() && std::distance(first, last) >= int_cmp_len()) { + while (!cpp20_and_in_constexpr() && + std::distance(first, last) >= int_cmp_len()) { ::memcpy(&val, first, sizeof(uint64_t)); if (val != int_cmp_zeros()) { break; @@ -176,11 +193,12 @@ void skip_zeros(UC const * & first, UC const * last) noexcept { // determine if any non-zero digits were truncated. 
// all characters must be valid digits. template -fastfloat_really_inline FASTFLOAT_CONSTEXPR20 -bool is_truncated(UC const * first, UC const * last) noexcept { +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 bool +is_truncated(UC const *first, UC const *last) noexcept { // do 8-bit optimizations, can just compare to 8 literal 0s. uint64_t val; - while (!cpp20_and_in_constexpr() && std::distance(first, last) >= int_cmp_len()) { + while (!cpp20_and_in_constexpr() && + std::distance(first, last) >= int_cmp_len()) { ::memcpy(&val, first, sizeof(uint64_t)); if (val != int_cmp_zeros()) { return true; @@ -196,15 +214,15 @@ bool is_truncated(UC const * first, UC const * last) noexcept { return false; } template -fastfloat_really_inline FASTFLOAT_CONSTEXPR20 -bool is_truncated(span s) noexcept { +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 bool +is_truncated(span s) noexcept { return is_truncated(s.ptr, s.ptr + s.len()); } - template -fastfloat_really_inline FASTFLOAT_CONSTEXPR20 -void parse_eight_digits(const UC*& p, limb& value, size_t& counter, size_t& count) noexcept { +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 void +parse_eight_digits(const UC *&p, limb &value, size_t &counter, + size_t &count) noexcept { value = value * 100000000 + parse_eight_digits_unrolled(p); p += 8; counter += 8; @@ -212,22 +230,23 @@ void parse_eight_digits(const UC*& p, limb& value, size_t& counter, size_t& coun } template -fastfloat_really_inline FASTFLOAT_CONSTEXPR14 -void parse_one_digit(UC const *& p, limb& value, size_t& counter, size_t& count) noexcept { +fastfloat_really_inline FASTFLOAT_CONSTEXPR14 void +parse_one_digit(UC const *&p, limb &value, size_t &counter, + size_t &count) noexcept { value = value * 10 + limb(*p - UC('0')); p++; counter++; count++; } -fastfloat_really_inline FASTFLOAT_CONSTEXPR20 -void add_native(bigint& big, limb power, limb value) noexcept { +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 void +add_native(bigint &big, limb power, limb value) noexcept { big.mul(power); big.add(value); } -fastfloat_really_inline FASTFLOAT_CONSTEXPR20 -void round_up_bigint(bigint& big, size_t& count) noexcept { +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 void +round_up_bigint(bigint &big, size_t &count) noexcept { // need to round-up the digits, but need to avoid rounding // ....9999 to ...10000, which could cause a false halfway point. add_native(big, 10, 1); @@ -236,8 +255,9 @@ void round_up_bigint(bigint& big, size_t& count) noexcept { // parse the significant digits into a big integer template -inline FASTFLOAT_CONSTEXPR20 -void parse_mantissa(bigint& result, parsed_number_string_t& num, size_t max_digits, size_t& digits) noexcept { +inline FASTFLOAT_CONSTEXPR20 void +parse_mantissa(bigint &result, parsed_number_string_t &num, + size_t max_digits, size_t &digits) noexcept { // try to minimize the number of big integer and scalar multiplication. // therefore, try to parse 8 digits at a time, and multiply by the largest // scalar value (9 or 19 digits) for each step. @@ -251,12 +271,13 @@ void parse_mantissa(bigint& result, parsed_number_string_t& num, size_t max_ #endif // process all integer digits. 
- UC const * p = num.integer.ptr; - UC const * pend = p + num.integer.len(); + UC const *p = num.integer.ptr; + UC const *pend = p + num.integer.len(); skip_zeros(p, pend); // process all digits, in increments of step per loop while (p != pend) { - while ((std::distance(p, pend) >= 8) && (step - counter >= 8) && (max_digits - digits >= 8)) { + while ((std::distance(p, pend) >= 8) && (step - counter >= 8) && + (max_digits - digits >= 8)) { parse_eight_digits(p, value, counter, digits); } while (counter < step && p != pend && digits < max_digits) { @@ -289,7 +310,8 @@ void parse_mantissa(bigint& result, parsed_number_string_t& num, size_t max_ } // process all digits, in increments of step per loop while (p != pend) { - while ((std::distance(p, pend) >= 8) && (step - counter >= 8) && (max_digits - digits >= 8)) { + while ((std::distance(p, pend) >= 8) && (step - counter >= 8) && + (max_digits - digits >= 8)) { parse_eight_digits(p, value, counter, digits); } while (counter < step && p != pend && digits < max_digits) { @@ -317,19 +339,23 @@ void parse_mantissa(bigint& result, parsed_number_string_t& num, size_t max_ } template -inline FASTFLOAT_CONSTEXPR20 -adjusted_mantissa positive_digit_comp(bigint& bigmant, int32_t exponent) noexcept { +inline FASTFLOAT_CONSTEXPR20 adjusted_mantissa +positive_digit_comp(bigint &bigmant, int32_t exponent) noexcept { FASTFLOAT_ASSERT(bigmant.pow10(uint32_t(exponent))); adjusted_mantissa answer; bool truncated; answer.mantissa = bigmant.hi64(truncated); - int bias = binary_format::mantissa_explicit_bits() - binary_format::minimum_exponent(); + int bias = binary_format::mantissa_explicit_bits() - + binary_format::minimum_exponent(); answer.power2 = bigmant.bit_length() - 64 + bias; - round(answer, [truncated](adjusted_mantissa& a, int32_t shift) { - round_nearest_tie_even(a, shift, [truncated](bool is_odd, bool is_halfway, bool is_above) -> bool { - return is_above || (is_halfway && truncated) || (is_odd && is_halfway); - }); + round(answer, [truncated](adjusted_mantissa &a, int32_t shift) { + round_nearest_tie_even( + a, shift, + [truncated](bool is_odd, bool is_halfway, bool is_above) -> bool { + return is_above || (is_halfway && truncated) || + (is_odd && is_halfway); + }); }); return answer; @@ -341,15 +367,17 @@ adjusted_mantissa positive_digit_comp(bigint& bigmant, int32_t exponent) noexcep // we then need to scale by `2^(f- e)`, and then the two significant digits // are of the same magnitude. template -inline FASTFLOAT_CONSTEXPR20 -adjusted_mantissa negative_digit_comp(bigint& bigmant, adjusted_mantissa am, int32_t exponent) noexcept { - bigint& real_digits = bigmant; +inline FASTFLOAT_CONSTEXPR20 adjusted_mantissa negative_digit_comp( + bigint &bigmant, adjusted_mantissa am, int32_t exponent) noexcept { + bigint &real_digits = bigmant; int32_t real_exp = exponent; // get the value of `b`, rounded down, and get a bigint representation of b+h adjusted_mantissa am_b = am; - // gcc7 buf: use a lambda to remove the noexcept qualifier bug with -Wnoexcept-type. - round(am_b, [](adjusted_mantissa&a, int32_t shift) { round_down(a, shift); }); + // gcc7 buf: use a lambda to remove the noexcept qualifier bug with + // -Wnoexcept-type. 
+ round(am_b, + [](adjusted_mantissa &a, int32_t shift) { round_down(a, shift); }); T b; to_float(false, am_b, b); adjusted_mantissa theor = to_extended_halfway(b); @@ -371,18 +399,19 @@ adjusted_mantissa negative_digit_comp(bigint& bigmant, adjusted_mantissa am, int // compare digits, and use it to director rounding int ord = real_digits.compare(theor_digits); adjusted_mantissa answer = am; - round(answer, [ord](adjusted_mantissa& a, int32_t shift) { - round_nearest_tie_even(a, shift, [ord](bool is_odd, bool _, bool __) -> bool { - (void)_; // not needed, since we've done our comparison - (void)__; // not needed, since we've done our comparison - if (ord > 0) { - return true; - } else if (ord < 0) { - return false; - } else { - return is_odd; - } - }); + round(answer, [ord](adjusted_mantissa &a, int32_t shift) { + round_nearest_tie_even( + a, shift, [ord](bool is_odd, bool _, bool __) -> bool { + (void)_; // not needed, since we've done our comparison + (void)__; // not needed, since we've done our comparison + if (ord > 0) { + return true; + } else if (ord < 0) { + return false; + } else { + return is_odd; + } + }); }); return answer; @@ -402,8 +431,8 @@ adjusted_mantissa negative_digit_comp(bigint& bigmant, adjusted_mantissa am, int // the actual digits. we then compare the big integer representations // of both, and use that to direct rounding. template -inline FASTFLOAT_CONSTEXPR20 -adjusted_mantissa digit_comp(parsed_number_string_t& num, adjusted_mantissa am) noexcept { +inline FASTFLOAT_CONSTEXPR20 adjusted_mantissa +digit_comp(parsed_number_string_t &num, adjusted_mantissa am) noexcept { // remove the invalid exponent bias am.power2 -= invalid_am_bias; diff --git a/third_party/fast_float/fast_float.h b/third_party/fast_float/fast_float.h index 04efa877ee7b..42a3cdfaf4c4 100644 --- a/third_party/fast_float/fast_float.h +++ b/third_party/fast_float/fast_float.h @@ -6,36 +6,50 @@ namespace fast_float { /** - * This function parses the character sequence [first,last) for a number. It parses floating-point numbers expecting - * a locale-indepent format equivalent to what is used by std::strtod in the default ("C") locale. - * The resulting floating-point value is the closest floating-point values (using either float or double), - * using the "round to even" convention for values that would otherwise fall right in-between two values. - * That is, we provide exact parsing according to the IEEE standard. + * This function parses the character sequence [first,last) for a number. It + * parses floating-point numbers expecting a locale-indepent format equivalent + * to what is used by std::strtod in the default ("C") locale. The resulting + * floating-point value is the closest floating-point values (using either float + * or double), using the "round to even" convention for values that would + * otherwise fall right in-between two values. That is, we provide exact parsing + * according to the IEEE standard. * - * Given a successful parse, the pointer (`ptr`) in the returned value is set to point right after the - * parsed number, and the `value` referenced is set to the parsed value. In case of error, the returned - * `ec` contains a representative error, otherwise the default (`std::errc()`) value is stored. + * Given a successful parse, the pointer (`ptr`) in the returned value is set to + * point right after the parsed number, and the `value` referenced is set to the + * parsed value. 
In case of error, the returned `ec` contains a representative + * error, otherwise the default (`std::errc()`) value is stored. * - * The implementation does not throw and does not allocate memory (e.g., with `new` or `malloc`). + * The implementation does not throw and does not allocate memory (e.g., with + * `new` or `malloc`). * - * Like the C++17 standard, the `fast_float::from_chars` functions take an optional last argument of - * the type `fast_float::chars_format`. It is a bitset value: we check whether - * `fmt & fast_float::chars_format::fixed` and `fmt & fast_float::chars_format::scientific` are set - * to determine whether we allow the fixed point and scientific notation respectively. - * The default is `fast_float::chars_format::general` which allows both `fixed` and `scientific`. + * Like the C++17 standard, the `fast_float::from_chars` functions take an + * optional last argument of the type `fast_float::chars_format`. It is a bitset + * value: we check whether `fmt & fast_float::chars_format::fixed` and `fmt & + * fast_float::chars_format::scientific` are set to determine whether we allow + * the fixed point and scientific notation respectively. The default is + * `fast_float::chars_format::general` which allows both `fixed` and + * `scientific`. */ -template -FASTFLOAT_CONSTEXPR20 -from_chars_result_t from_chars(UC const * first, UC const * last, - T &value, chars_format fmt = chars_format::general) noexcept; +template ())> +FASTFLOAT_CONSTEXPR20 from_chars_result_t +from_chars(UC const *first, UC const *last, T &value, + chars_format fmt = chars_format::general) noexcept; /** * Like from_chars, but accepts an `options` argument to govern number parsing. */ -template -FASTFLOAT_CONSTEXPR20 -from_chars_result_t from_chars_advanced(UC const * first, UC const * last, - T &value, parse_options_t options) noexcept; +template +FASTFLOAT_CONSTEXPR20 from_chars_result_t +from_chars_advanced(UC const *first, UC const *last, T &value, + parse_options_t options) noexcept; +/** + * from_chars for integer types. + */ +template ())> +FASTFLOAT_CONSTEXPR20 from_chars_result_t +from_chars(UC const *first, UC const *last, T &value, int base = 10) noexcept; } // namespace fast_float #include "parse_number.h" diff --git a/third_party/fast_float/fast_table.h b/third_party/fast_float/fast_table.h index d8dc5690517c..69f9b2c9245f 100644 --- a/third_party/fast_float/fast_table.h +++ b/third_party/fast_float/fast_table.h @@ -29,669 +29,677 @@ namespace fast_float { * infinite in binary64 so we never need to worry about powers * of 5 greater than 308. */ -template -struct powers_template { +template struct powers_template { -constexpr static int smallest_power_of_five = binary_format::smallest_power_of_ten(); -constexpr static int largest_power_of_five = binary_format::largest_power_of_ten(); -constexpr static int number_of_entries = 2 * (largest_power_of_five - smallest_power_of_five + 1); -// Powers of five from 5^-342 all the way to 5^308 rounded toward one. 
-constexpr static uint64_t power_of_five_128[number_of_entries] = { - 0xeef453d6923bd65a,0x113faa2906a13b3f, - 0x9558b4661b6565f8,0x4ac7ca59a424c507, - 0xbaaee17fa23ebf76,0x5d79bcf00d2df649, - 0xe95a99df8ace6f53,0xf4d82c2c107973dc, - 0x91d8a02bb6c10594,0x79071b9b8a4be869, - 0xb64ec836a47146f9,0x9748e2826cdee284, - 0xe3e27a444d8d98b7,0xfd1b1b2308169b25, - 0x8e6d8c6ab0787f72,0xfe30f0f5e50e20f7, - 0xb208ef855c969f4f,0xbdbd2d335e51a935, - 0xde8b2b66b3bc4723,0xad2c788035e61382, - 0x8b16fb203055ac76,0x4c3bcb5021afcc31, - 0xaddcb9e83c6b1793,0xdf4abe242a1bbf3d, - 0xd953e8624b85dd78,0xd71d6dad34a2af0d, - 0x87d4713d6f33aa6b,0x8672648c40e5ad68, - 0xa9c98d8ccb009506,0x680efdaf511f18c2, - 0xd43bf0effdc0ba48,0x212bd1b2566def2, - 0x84a57695fe98746d,0x14bb630f7604b57, - 0xa5ced43b7e3e9188,0x419ea3bd35385e2d, - 0xcf42894a5dce35ea,0x52064cac828675b9, - 0x818995ce7aa0e1b2,0x7343efebd1940993, - 0xa1ebfb4219491a1f,0x1014ebe6c5f90bf8, - 0xca66fa129f9b60a6,0xd41a26e077774ef6, - 0xfd00b897478238d0,0x8920b098955522b4, - 0x9e20735e8cb16382,0x55b46e5f5d5535b0, - 0xc5a890362fddbc62,0xeb2189f734aa831d, - 0xf712b443bbd52b7b,0xa5e9ec7501d523e4, - 0x9a6bb0aa55653b2d,0x47b233c92125366e, - 0xc1069cd4eabe89f8,0x999ec0bb696e840a, - 0xf148440a256e2c76,0xc00670ea43ca250d, - 0x96cd2a865764dbca,0x380406926a5e5728, - 0xbc807527ed3e12bc,0xc605083704f5ecf2, - 0xeba09271e88d976b,0xf7864a44c633682e, - 0x93445b8731587ea3,0x7ab3ee6afbe0211d, - 0xb8157268fdae9e4c,0x5960ea05bad82964, - 0xe61acf033d1a45df,0x6fb92487298e33bd, - 0x8fd0c16206306bab,0xa5d3b6d479f8e056, - 0xb3c4f1ba87bc8696,0x8f48a4899877186c, - 0xe0b62e2929aba83c,0x331acdabfe94de87, - 0x8c71dcd9ba0b4925,0x9ff0c08b7f1d0b14, - 0xaf8e5410288e1b6f,0x7ecf0ae5ee44dd9, - 0xdb71e91432b1a24a,0xc9e82cd9f69d6150, - 0x892731ac9faf056e,0xbe311c083a225cd2, - 0xab70fe17c79ac6ca,0x6dbd630a48aaf406, - 0xd64d3d9db981787d,0x92cbbccdad5b108, - 0x85f0468293f0eb4e,0x25bbf56008c58ea5, - 0xa76c582338ed2621,0xaf2af2b80af6f24e, - 0xd1476e2c07286faa,0x1af5af660db4aee1, - 0x82cca4db847945ca,0x50d98d9fc890ed4d, - 0xa37fce126597973c,0xe50ff107bab528a0, - 0xcc5fc196fefd7d0c,0x1e53ed49a96272c8, - 0xff77b1fcbebcdc4f,0x25e8e89c13bb0f7a, - 0x9faacf3df73609b1,0x77b191618c54e9ac, - 0xc795830d75038c1d,0xd59df5b9ef6a2417, - 0xf97ae3d0d2446f25,0x4b0573286b44ad1d, - 0x9becce62836ac577,0x4ee367f9430aec32, - 0xc2e801fb244576d5,0x229c41f793cda73f, - 0xf3a20279ed56d48a,0x6b43527578c1110f, - 0x9845418c345644d6,0x830a13896b78aaa9, - 0xbe5691ef416bd60c,0x23cc986bc656d553, - 0xedec366b11c6cb8f,0x2cbfbe86b7ec8aa8, - 0x94b3a202eb1c3f39,0x7bf7d71432f3d6a9, - 0xb9e08a83a5e34f07,0xdaf5ccd93fb0cc53, - 0xe858ad248f5c22c9,0xd1b3400f8f9cff68, - 0x91376c36d99995be,0x23100809b9c21fa1, - 0xb58547448ffffb2d,0xabd40a0c2832a78a, - 0xe2e69915b3fff9f9,0x16c90c8f323f516c, - 0x8dd01fad907ffc3b,0xae3da7d97f6792e3, - 0xb1442798f49ffb4a,0x99cd11cfdf41779c, - 0xdd95317f31c7fa1d,0x40405643d711d583, - 0x8a7d3eef7f1cfc52,0x482835ea666b2572, - 0xad1c8eab5ee43b66,0xda3243650005eecf, - 0xd863b256369d4a40,0x90bed43e40076a82, - 0x873e4f75e2224e68,0x5a7744a6e804a291, - 0xa90de3535aaae202,0x711515d0a205cb36, - 0xd3515c2831559a83,0xd5a5b44ca873e03, - 0x8412d9991ed58091,0xe858790afe9486c2, - 0xa5178fff668ae0b6,0x626e974dbe39a872, - 0xce5d73ff402d98e3,0xfb0a3d212dc8128f, - 0x80fa687f881c7f8e,0x7ce66634bc9d0b99, - 0xa139029f6a239f72,0x1c1fffc1ebc44e80, - 0xc987434744ac874e,0xa327ffb266b56220, - 0xfbe9141915d7a922,0x4bf1ff9f0062baa8, - 0x9d71ac8fada6c9b5,0x6f773fc3603db4a9, - 0xc4ce17b399107c22,0xcb550fb4384d21d3, - 0xf6019da07f549b2b,0x7e2a53a146606a48, - 
0x99c102844f94e0fb,0x2eda7444cbfc426d, - 0xc0314325637a1939,0xfa911155fefb5308, - 0xf03d93eebc589f88,0x793555ab7eba27ca, - 0x96267c7535b763b5,0x4bc1558b2f3458de, - 0xbbb01b9283253ca2,0x9eb1aaedfb016f16, - 0xea9c227723ee8bcb,0x465e15a979c1cadc, - 0x92a1958a7675175f,0xbfacd89ec191ec9, - 0xb749faed14125d36,0xcef980ec671f667b, - 0xe51c79a85916f484,0x82b7e12780e7401a, - 0x8f31cc0937ae58d2,0xd1b2ecb8b0908810, - 0xb2fe3f0b8599ef07,0x861fa7e6dcb4aa15, - 0xdfbdcece67006ac9,0x67a791e093e1d49a, - 0x8bd6a141006042bd,0xe0c8bb2c5c6d24e0, - 0xaecc49914078536d,0x58fae9f773886e18, - 0xda7f5bf590966848,0xaf39a475506a899e, - 0x888f99797a5e012d,0x6d8406c952429603, - 0xaab37fd7d8f58178,0xc8e5087ba6d33b83, - 0xd5605fcdcf32e1d6,0xfb1e4a9a90880a64, - 0x855c3be0a17fcd26,0x5cf2eea09a55067f, - 0xa6b34ad8c9dfc06f,0xf42faa48c0ea481e, - 0xd0601d8efc57b08b,0xf13b94daf124da26, - 0x823c12795db6ce57,0x76c53d08d6b70858, - 0xa2cb1717b52481ed,0x54768c4b0c64ca6e, - 0xcb7ddcdda26da268,0xa9942f5dcf7dfd09, - 0xfe5d54150b090b02,0xd3f93b35435d7c4c, - 0x9efa548d26e5a6e1,0xc47bc5014a1a6daf, - 0xc6b8e9b0709f109a,0x359ab6419ca1091b, - 0xf867241c8cc6d4c0,0xc30163d203c94b62, - 0x9b407691d7fc44f8,0x79e0de63425dcf1d, - 0xc21094364dfb5636,0x985915fc12f542e4, - 0xf294b943e17a2bc4,0x3e6f5b7b17b2939d, - 0x979cf3ca6cec5b5a,0xa705992ceecf9c42, - 0xbd8430bd08277231,0x50c6ff782a838353, - 0xece53cec4a314ebd,0xa4f8bf5635246428, - 0x940f4613ae5ed136,0x871b7795e136be99, - 0xb913179899f68584,0x28e2557b59846e3f, - 0xe757dd7ec07426e5,0x331aeada2fe589cf, - 0x9096ea6f3848984f,0x3ff0d2c85def7621, - 0xb4bca50b065abe63,0xfed077a756b53a9, - 0xe1ebce4dc7f16dfb,0xd3e8495912c62894, - 0x8d3360f09cf6e4bd,0x64712dd7abbbd95c, - 0xb080392cc4349dec,0xbd8d794d96aacfb3, - 0xdca04777f541c567,0xecf0d7a0fc5583a0, - 0x89e42caaf9491b60,0xf41686c49db57244, - 0xac5d37d5b79b6239,0x311c2875c522ced5, - 0xd77485cb25823ac7,0x7d633293366b828b, - 0x86a8d39ef77164bc,0xae5dff9c02033197, - 0xa8530886b54dbdeb,0xd9f57f830283fdfc, - 0xd267caa862a12d66,0xd072df63c324fd7b, - 0x8380dea93da4bc60,0x4247cb9e59f71e6d, - 0xa46116538d0deb78,0x52d9be85f074e608, - 0xcd795be870516656,0x67902e276c921f8b, - 0x806bd9714632dff6,0xba1cd8a3db53b6, - 0xa086cfcd97bf97f3,0x80e8a40eccd228a4, - 0xc8a883c0fdaf7df0,0x6122cd128006b2cd, - 0xfad2a4b13d1b5d6c,0x796b805720085f81, - 0x9cc3a6eec6311a63,0xcbe3303674053bb0, - 0xc3f490aa77bd60fc,0xbedbfc4411068a9c, - 0xf4f1b4d515acb93b,0xee92fb5515482d44, - 0x991711052d8bf3c5,0x751bdd152d4d1c4a, - 0xbf5cd54678eef0b6,0xd262d45a78a0635d, - 0xef340a98172aace4,0x86fb897116c87c34, - 0x9580869f0e7aac0e,0xd45d35e6ae3d4da0, - 0xbae0a846d2195712,0x8974836059cca109, - 0xe998d258869facd7,0x2bd1a438703fc94b, - 0x91ff83775423cc06,0x7b6306a34627ddcf, - 0xb67f6455292cbf08,0x1a3bc84c17b1d542, - 0xe41f3d6a7377eeca,0x20caba5f1d9e4a93, - 0x8e938662882af53e,0x547eb47b7282ee9c, - 0xb23867fb2a35b28d,0xe99e619a4f23aa43, - 0xdec681f9f4c31f31,0x6405fa00e2ec94d4, - 0x8b3c113c38f9f37e,0xde83bc408dd3dd04, - 0xae0b158b4738705e,0x9624ab50b148d445, - 0xd98ddaee19068c76,0x3badd624dd9b0957, - 0x87f8a8d4cfa417c9,0xe54ca5d70a80e5d6, - 0xa9f6d30a038d1dbc,0x5e9fcf4ccd211f4c, - 0xd47487cc8470652b,0x7647c3200069671f, - 0x84c8d4dfd2c63f3b,0x29ecd9f40041e073, - 0xa5fb0a17c777cf09,0xf468107100525890, - 0xcf79cc9db955c2cc,0x7182148d4066eeb4, - 0x81ac1fe293d599bf,0xc6f14cd848405530, - 0xa21727db38cb002f,0xb8ada00e5a506a7c, - 0xca9cf1d206fdc03b,0xa6d90811f0e4851c, - 0xfd442e4688bd304a,0x908f4a166d1da663, - 0x9e4a9cec15763e2e,0x9a598e4e043287fe, - 0xc5dd44271ad3cdba,0x40eff1e1853f29fd, - 
0xf7549530e188c128,0xd12bee59e68ef47c, - 0x9a94dd3e8cf578b9,0x82bb74f8301958ce, - 0xc13a148e3032d6e7,0xe36a52363c1faf01, - 0xf18899b1bc3f8ca1,0xdc44e6c3cb279ac1, - 0x96f5600f15a7b7e5,0x29ab103a5ef8c0b9, - 0xbcb2b812db11a5de,0x7415d448f6b6f0e7, - 0xebdf661791d60f56,0x111b495b3464ad21, - 0x936b9fcebb25c995,0xcab10dd900beec34, - 0xb84687c269ef3bfb,0x3d5d514f40eea742, - 0xe65829b3046b0afa,0xcb4a5a3112a5112, - 0x8ff71a0fe2c2e6dc,0x47f0e785eaba72ab, - 0xb3f4e093db73a093,0x59ed216765690f56, - 0xe0f218b8d25088b8,0x306869c13ec3532c, - 0x8c974f7383725573,0x1e414218c73a13fb, - 0xafbd2350644eeacf,0xe5d1929ef90898fa, - 0xdbac6c247d62a583,0xdf45f746b74abf39, - 0x894bc396ce5da772,0x6b8bba8c328eb783, - 0xab9eb47c81f5114f,0x66ea92f3f326564, - 0xd686619ba27255a2,0xc80a537b0efefebd, - 0x8613fd0145877585,0xbd06742ce95f5f36, - 0xa798fc4196e952e7,0x2c48113823b73704, - 0xd17f3b51fca3a7a0,0xf75a15862ca504c5, - 0x82ef85133de648c4,0x9a984d73dbe722fb, - 0xa3ab66580d5fdaf5,0xc13e60d0d2e0ebba, - 0xcc963fee10b7d1b3,0x318df905079926a8, - 0xffbbcfe994e5c61f,0xfdf17746497f7052, - 0x9fd561f1fd0f9bd3,0xfeb6ea8bedefa633, - 0xc7caba6e7c5382c8,0xfe64a52ee96b8fc0, - 0xf9bd690a1b68637b,0x3dfdce7aa3c673b0, - 0x9c1661a651213e2d,0x6bea10ca65c084e, - 0xc31bfa0fe5698db8,0x486e494fcff30a62, - 0xf3e2f893dec3f126,0x5a89dba3c3efccfa, - 0x986ddb5c6b3a76b7,0xf89629465a75e01c, - 0xbe89523386091465,0xf6bbb397f1135823, - 0xee2ba6c0678b597f,0x746aa07ded582e2c, - 0x94db483840b717ef,0xa8c2a44eb4571cdc, - 0xba121a4650e4ddeb,0x92f34d62616ce413, - 0xe896a0d7e51e1566,0x77b020baf9c81d17, - 0x915e2486ef32cd60,0xace1474dc1d122e, - 0xb5b5ada8aaff80b8,0xd819992132456ba, - 0xe3231912d5bf60e6,0x10e1fff697ed6c69, - 0x8df5efabc5979c8f,0xca8d3ffa1ef463c1, - 0xb1736b96b6fd83b3,0xbd308ff8a6b17cb2, - 0xddd0467c64bce4a0,0xac7cb3f6d05ddbde, - 0x8aa22c0dbef60ee4,0x6bcdf07a423aa96b, - 0xad4ab7112eb3929d,0x86c16c98d2c953c6, - 0xd89d64d57a607744,0xe871c7bf077ba8b7, - 0x87625f056c7c4a8b,0x11471cd764ad4972, - 0xa93af6c6c79b5d2d,0xd598e40d3dd89bcf, - 0xd389b47879823479,0x4aff1d108d4ec2c3, - 0x843610cb4bf160cb,0xcedf722a585139ba, - 0xa54394fe1eedb8fe,0xc2974eb4ee658828, - 0xce947a3da6a9273e,0x733d226229feea32, - 0x811ccc668829b887,0x806357d5a3f525f, - 0xa163ff802a3426a8,0xca07c2dcb0cf26f7, - 0xc9bcff6034c13052,0xfc89b393dd02f0b5, - 0xfc2c3f3841f17c67,0xbbac2078d443ace2, - 0x9d9ba7832936edc0,0xd54b944b84aa4c0d, - 0xc5029163f384a931,0xa9e795e65d4df11, - 0xf64335bcf065d37d,0x4d4617b5ff4a16d5, - 0x99ea0196163fa42e,0x504bced1bf8e4e45, - 0xc06481fb9bcf8d39,0xe45ec2862f71e1d6, - 0xf07da27a82c37088,0x5d767327bb4e5a4c, - 0x964e858c91ba2655,0x3a6a07f8d510f86f, - 0xbbe226efb628afea,0x890489f70a55368b, - 0xeadab0aba3b2dbe5,0x2b45ac74ccea842e, - 0x92c8ae6b464fc96f,0x3b0b8bc90012929d, - 0xb77ada0617e3bbcb,0x9ce6ebb40173744, - 0xe55990879ddcaabd,0xcc420a6a101d0515, - 0x8f57fa54c2a9eab6,0x9fa946824a12232d, - 0xb32df8e9f3546564,0x47939822dc96abf9, - 0xdff9772470297ebd,0x59787e2b93bc56f7, - 0x8bfbea76c619ef36,0x57eb4edb3c55b65a, - 0xaefae51477a06b03,0xede622920b6b23f1, - 0xdab99e59958885c4,0xe95fab368e45eced, - 0x88b402f7fd75539b,0x11dbcb0218ebb414, - 0xaae103b5fcd2a881,0xd652bdc29f26a119, - 0xd59944a37c0752a2,0x4be76d3346f0495f, - 0x857fcae62d8493a5,0x6f70a4400c562ddb, - 0xa6dfbd9fb8e5b88e,0xcb4ccd500f6bb952, - 0xd097ad07a71f26b2,0x7e2000a41346a7a7, - 0x825ecc24c873782f,0x8ed400668c0c28c8, - 0xa2f67f2dfa90563b,0x728900802f0f32fa, - 0xcbb41ef979346bca,0x4f2b40a03ad2ffb9, - 0xfea126b7d78186bc,0xe2f610c84987bfa8, - 0x9f24b832e6b0f436,0xdd9ca7d2df4d7c9, - 
0xc6ede63fa05d3143,0x91503d1c79720dbb, - 0xf8a95fcf88747d94,0x75a44c6397ce912a, - 0x9b69dbe1b548ce7c,0xc986afbe3ee11aba, - 0xc24452da229b021b,0xfbe85badce996168, - 0xf2d56790ab41c2a2,0xfae27299423fb9c3, - 0x97c560ba6b0919a5,0xdccd879fc967d41a, - 0xbdb6b8e905cb600f,0x5400e987bbc1c920, - 0xed246723473e3813,0x290123e9aab23b68, - 0x9436c0760c86e30b,0xf9a0b6720aaf6521, - 0xb94470938fa89bce,0xf808e40e8d5b3e69, - 0xe7958cb87392c2c2,0xb60b1d1230b20e04, - 0x90bd77f3483bb9b9,0xb1c6f22b5e6f48c2, - 0xb4ecd5f01a4aa828,0x1e38aeb6360b1af3, - 0xe2280b6c20dd5232,0x25c6da63c38de1b0, - 0x8d590723948a535f,0x579c487e5a38ad0e, - 0xb0af48ec79ace837,0x2d835a9df0c6d851, - 0xdcdb1b2798182244,0xf8e431456cf88e65, - 0x8a08f0f8bf0f156b,0x1b8e9ecb641b58ff, - 0xac8b2d36eed2dac5,0xe272467e3d222f3f, - 0xd7adf884aa879177,0x5b0ed81dcc6abb0f, - 0x86ccbb52ea94baea,0x98e947129fc2b4e9, - 0xa87fea27a539e9a5,0x3f2398d747b36224, - 0xd29fe4b18e88640e,0x8eec7f0d19a03aad, - 0x83a3eeeef9153e89,0x1953cf68300424ac, - 0xa48ceaaab75a8e2b,0x5fa8c3423c052dd7, - 0xcdb02555653131b6,0x3792f412cb06794d, - 0x808e17555f3ebf11,0xe2bbd88bbee40bd0, - 0xa0b19d2ab70e6ed6,0x5b6aceaeae9d0ec4, - 0xc8de047564d20a8b,0xf245825a5a445275, - 0xfb158592be068d2e,0xeed6e2f0f0d56712, - 0x9ced737bb6c4183d,0x55464dd69685606b, - 0xc428d05aa4751e4c,0xaa97e14c3c26b886, - 0xf53304714d9265df,0xd53dd99f4b3066a8, - 0x993fe2c6d07b7fab,0xe546a8038efe4029, - 0xbf8fdb78849a5f96,0xde98520472bdd033, - 0xef73d256a5c0f77c,0x963e66858f6d4440, - 0x95a8637627989aad,0xdde7001379a44aa8, - 0xbb127c53b17ec159,0x5560c018580d5d52, - 0xe9d71b689dde71af,0xaab8f01e6e10b4a6, - 0x9226712162ab070d,0xcab3961304ca70e8, - 0xb6b00d69bb55c8d1,0x3d607b97c5fd0d22, - 0xe45c10c42a2b3b05,0x8cb89a7db77c506a, - 0x8eb98a7a9a5b04e3,0x77f3608e92adb242, - 0xb267ed1940f1c61c,0x55f038b237591ed3, - 0xdf01e85f912e37a3,0x6b6c46dec52f6688, - 0x8b61313bbabce2c6,0x2323ac4b3b3da015, - 0xae397d8aa96c1b77,0xabec975e0a0d081a, - 0xd9c7dced53c72255,0x96e7bd358c904a21, - 0x881cea14545c7575,0x7e50d64177da2e54, - 0xaa242499697392d2,0xdde50bd1d5d0b9e9, - 0xd4ad2dbfc3d07787,0x955e4ec64b44e864, - 0x84ec3c97da624ab4,0xbd5af13bef0b113e, - 0xa6274bbdd0fadd61,0xecb1ad8aeacdd58e, - 0xcfb11ead453994ba,0x67de18eda5814af2, - 0x81ceb32c4b43fcf4,0x80eacf948770ced7, - 0xa2425ff75e14fc31,0xa1258379a94d028d, - 0xcad2f7f5359a3b3e,0x96ee45813a04330, - 0xfd87b5f28300ca0d,0x8bca9d6e188853fc, - 0x9e74d1b791e07e48,0x775ea264cf55347e, - 0xc612062576589dda,0x95364afe032a819e, - 0xf79687aed3eec551,0x3a83ddbd83f52205, - 0x9abe14cd44753b52,0xc4926a9672793543, - 0xc16d9a0095928a27,0x75b7053c0f178294, - 0xf1c90080baf72cb1,0x5324c68b12dd6339, - 0x971da05074da7bee,0xd3f6fc16ebca5e04, - 0xbce5086492111aea,0x88f4bb1ca6bcf585, - 0xec1e4a7db69561a5,0x2b31e9e3d06c32e6, - 0x9392ee8e921d5d07,0x3aff322e62439fd0, - 0xb877aa3236a4b449,0x9befeb9fad487c3, - 0xe69594bec44de15b,0x4c2ebe687989a9b4, - 0x901d7cf73ab0acd9,0xf9d37014bf60a11, - 0xb424dc35095cd80f,0x538484c19ef38c95, - 0xe12e13424bb40e13,0x2865a5f206b06fba, - 0x8cbccc096f5088cb,0xf93f87b7442e45d4, - 0xafebff0bcb24aafe,0xf78f69a51539d749, - 0xdbe6fecebdedd5be,0xb573440e5a884d1c, - 0x89705f4136b4a597,0x31680a88f8953031, - 0xabcc77118461cefc,0xfdc20d2b36ba7c3e, - 0xd6bf94d5e57a42bc,0x3d32907604691b4d, - 0x8637bd05af6c69b5,0xa63f9a49c2c1b110, - 0xa7c5ac471b478423,0xfcf80dc33721d54, - 0xd1b71758e219652b,0xd3c36113404ea4a9, - 0x83126e978d4fdf3b,0x645a1cac083126ea, - 0xa3d70a3d70a3d70a,0x3d70a3d70a3d70a4, - 0xcccccccccccccccc,0xcccccccccccccccd, - 0x8000000000000000,0x0, - 0xa000000000000000,0x0, - 
0xc800000000000000,0x0, - 0xfa00000000000000,0x0, - 0x9c40000000000000,0x0, - 0xc350000000000000,0x0, - 0xf424000000000000,0x0, - 0x9896800000000000,0x0, - 0xbebc200000000000,0x0, - 0xee6b280000000000,0x0, - 0x9502f90000000000,0x0, - 0xba43b74000000000,0x0, - 0xe8d4a51000000000,0x0, - 0x9184e72a00000000,0x0, - 0xb5e620f480000000,0x0, - 0xe35fa931a0000000,0x0, - 0x8e1bc9bf04000000,0x0, - 0xb1a2bc2ec5000000,0x0, - 0xde0b6b3a76400000,0x0, - 0x8ac7230489e80000,0x0, - 0xad78ebc5ac620000,0x0, - 0xd8d726b7177a8000,0x0, - 0x878678326eac9000,0x0, - 0xa968163f0a57b400,0x0, - 0xd3c21bcecceda100,0x0, - 0x84595161401484a0,0x0, - 0xa56fa5b99019a5c8,0x0, - 0xcecb8f27f4200f3a,0x0, - 0x813f3978f8940984,0x4000000000000000, - 0xa18f07d736b90be5,0x5000000000000000, - 0xc9f2c9cd04674ede,0xa400000000000000, - 0xfc6f7c4045812296,0x4d00000000000000, - 0x9dc5ada82b70b59d,0xf020000000000000, - 0xc5371912364ce305,0x6c28000000000000, - 0xf684df56c3e01bc6,0xc732000000000000, - 0x9a130b963a6c115c,0x3c7f400000000000, - 0xc097ce7bc90715b3,0x4b9f100000000000, - 0xf0bdc21abb48db20,0x1e86d40000000000, - 0x96769950b50d88f4,0x1314448000000000, - 0xbc143fa4e250eb31,0x17d955a000000000, - 0xeb194f8e1ae525fd,0x5dcfab0800000000, - 0x92efd1b8d0cf37be,0x5aa1cae500000000, - 0xb7abc627050305ad,0xf14a3d9e40000000, - 0xe596b7b0c643c719,0x6d9ccd05d0000000, - 0x8f7e32ce7bea5c6f,0xe4820023a2000000, - 0xb35dbf821ae4f38b,0xdda2802c8a800000, - 0xe0352f62a19e306e,0xd50b2037ad200000, - 0x8c213d9da502de45,0x4526f422cc340000, - 0xaf298d050e4395d6,0x9670b12b7f410000, - 0xdaf3f04651d47b4c,0x3c0cdd765f114000, - 0x88d8762bf324cd0f,0xa5880a69fb6ac800, - 0xab0e93b6efee0053,0x8eea0d047a457a00, - 0xd5d238a4abe98068,0x72a4904598d6d880, - 0x85a36366eb71f041,0x47a6da2b7f864750, - 0xa70c3c40a64e6c51,0x999090b65f67d924, - 0xd0cf4b50cfe20765,0xfff4b4e3f741cf6d, - 0x82818f1281ed449f,0xbff8f10e7a8921a4, - 0xa321f2d7226895c7,0xaff72d52192b6a0d, - 0xcbea6f8ceb02bb39,0x9bf4f8a69f764490, - 0xfee50b7025c36a08,0x2f236d04753d5b4, - 0x9f4f2726179a2245,0x1d762422c946590, - 0xc722f0ef9d80aad6,0x424d3ad2b7b97ef5, - 0xf8ebad2b84e0d58b,0xd2e0898765a7deb2, - 0x9b934c3b330c8577,0x63cc55f49f88eb2f, - 0xc2781f49ffcfa6d5,0x3cbf6b71c76b25fb, - 0xf316271c7fc3908a,0x8bef464e3945ef7a, - 0x97edd871cfda3a56,0x97758bf0e3cbb5ac, - 0xbde94e8e43d0c8ec,0x3d52eeed1cbea317, - 0xed63a231d4c4fb27,0x4ca7aaa863ee4bdd, - 0x945e455f24fb1cf8,0x8fe8caa93e74ef6a, - 0xb975d6b6ee39e436,0xb3e2fd538e122b44, - 0xe7d34c64a9c85d44,0x60dbbca87196b616, - 0x90e40fbeea1d3a4a,0xbc8955e946fe31cd, - 0xb51d13aea4a488dd,0x6babab6398bdbe41, - 0xe264589a4dcdab14,0xc696963c7eed2dd1, - 0x8d7eb76070a08aec,0xfc1e1de5cf543ca2, - 0xb0de65388cc8ada8,0x3b25a55f43294bcb, - 0xdd15fe86affad912,0x49ef0eb713f39ebe, - 0x8a2dbf142dfcc7ab,0x6e3569326c784337, - 0xacb92ed9397bf996,0x49c2c37f07965404, - 0xd7e77a8f87daf7fb,0xdc33745ec97be906, - 0x86f0ac99b4e8dafd,0x69a028bb3ded71a3, - 0xa8acd7c0222311bc,0xc40832ea0d68ce0c, - 0xd2d80db02aabd62b,0xf50a3fa490c30190, - 0x83c7088e1aab65db,0x792667c6da79e0fa, - 0xa4b8cab1a1563f52,0x577001b891185938, - 0xcde6fd5e09abcf26,0xed4c0226b55e6f86, - 0x80b05e5ac60b6178,0x544f8158315b05b4, - 0xa0dc75f1778e39d6,0x696361ae3db1c721, - 0xc913936dd571c84c,0x3bc3a19cd1e38e9, - 0xfb5878494ace3a5f,0x4ab48a04065c723, - 0x9d174b2dcec0e47b,0x62eb0d64283f9c76, - 0xc45d1df942711d9a,0x3ba5d0bd324f8394, - 0xf5746577930d6500,0xca8f44ec7ee36479, - 0x9968bf6abbe85f20,0x7e998b13cf4e1ecb, - 0xbfc2ef456ae276e8,0x9e3fedd8c321a67e, - 0xefb3ab16c59b14a2,0xc5cfe94ef3ea101e, - 0x95d04aee3b80ece5,0xbba1f1d158724a12, - 
0xbb445da9ca61281f,0x2a8a6e45ae8edc97, - 0xea1575143cf97226,0xf52d09d71a3293bd, - 0x924d692ca61be758,0x593c2626705f9c56, - 0xb6e0c377cfa2e12e,0x6f8b2fb00c77836c, - 0xe498f455c38b997a,0xb6dfb9c0f956447, - 0x8edf98b59a373fec,0x4724bd4189bd5eac, - 0xb2977ee300c50fe7,0x58edec91ec2cb657, - 0xdf3d5e9bc0f653e1,0x2f2967b66737e3ed, - 0x8b865b215899f46c,0xbd79e0d20082ee74, - 0xae67f1e9aec07187,0xecd8590680a3aa11, - 0xda01ee641a708de9,0xe80e6f4820cc9495, - 0x884134fe908658b2,0x3109058d147fdcdd, - 0xaa51823e34a7eede,0xbd4b46f0599fd415, - 0xd4e5e2cdc1d1ea96,0x6c9e18ac7007c91a, - 0x850fadc09923329e,0x3e2cf6bc604ddb0, - 0xa6539930bf6bff45,0x84db8346b786151c, - 0xcfe87f7cef46ff16,0xe612641865679a63, - 0x81f14fae158c5f6e,0x4fcb7e8f3f60c07e, - 0xa26da3999aef7749,0xe3be5e330f38f09d, - 0xcb090c8001ab551c,0x5cadf5bfd3072cc5, - 0xfdcb4fa002162a63,0x73d9732fc7c8f7f6, - 0x9e9f11c4014dda7e,0x2867e7fddcdd9afa, - 0xc646d63501a1511d,0xb281e1fd541501b8, - 0xf7d88bc24209a565,0x1f225a7ca91a4226, - 0x9ae757596946075f,0x3375788de9b06958, - 0xc1a12d2fc3978937,0x52d6b1641c83ae, - 0xf209787bb47d6b84,0xc0678c5dbd23a49a, - 0x9745eb4d50ce6332,0xf840b7ba963646e0, - 0xbd176620a501fbff,0xb650e5a93bc3d898, - 0xec5d3fa8ce427aff,0xa3e51f138ab4cebe, - 0x93ba47c980e98cdf,0xc66f336c36b10137, - 0xb8a8d9bbe123f017,0xb80b0047445d4184, - 0xe6d3102ad96cec1d,0xa60dc059157491e5, - 0x9043ea1ac7e41392,0x87c89837ad68db2f, - 0xb454e4a179dd1877,0x29babe4598c311fb, - 0xe16a1dc9d8545e94,0xf4296dd6fef3d67a, - 0x8ce2529e2734bb1d,0x1899e4a65f58660c, - 0xb01ae745b101e9e4,0x5ec05dcff72e7f8f, - 0xdc21a1171d42645d,0x76707543f4fa1f73, - 0x899504ae72497eba,0x6a06494a791c53a8, - 0xabfa45da0edbde69,0x487db9d17636892, - 0xd6f8d7509292d603,0x45a9d2845d3c42b6, - 0x865b86925b9bc5c2,0xb8a2392ba45a9b2, - 0xa7f26836f282b732,0x8e6cac7768d7141e, - 0xd1ef0244af2364ff,0x3207d795430cd926, - 0x8335616aed761f1f,0x7f44e6bd49e807b8, - 0xa402b9c5a8d3a6e7,0x5f16206c9c6209a6, - 0xcd036837130890a1,0x36dba887c37a8c0f, - 0x802221226be55a64,0xc2494954da2c9789, - 0xa02aa96b06deb0fd,0xf2db9baa10b7bd6c, - 0xc83553c5c8965d3d,0x6f92829494e5acc7, - 0xfa42a8b73abbf48c,0xcb772339ba1f17f9, - 0x9c69a97284b578d7,0xff2a760414536efb, - 0xc38413cf25e2d70d,0xfef5138519684aba, - 0xf46518c2ef5b8cd1,0x7eb258665fc25d69, - 0x98bf2f79d5993802,0xef2f773ffbd97a61, - 0xbeeefb584aff8603,0xaafb550ffacfd8fa, - 0xeeaaba2e5dbf6784,0x95ba2a53f983cf38, - 0x952ab45cfa97a0b2,0xdd945a747bf26183, - 0xba756174393d88df,0x94f971119aeef9e4, - 0xe912b9d1478ceb17,0x7a37cd5601aab85d, - 0x91abb422ccb812ee,0xac62e055c10ab33a, - 0xb616a12b7fe617aa,0x577b986b314d6009, - 0xe39c49765fdf9d94,0xed5a7e85fda0b80b, - 0x8e41ade9fbebc27d,0x14588f13be847307, - 0xb1d219647ae6b31c,0x596eb2d8ae258fc8, - 0xde469fbd99a05fe3,0x6fca5f8ed9aef3bb, - 0x8aec23d680043bee,0x25de7bb9480d5854, - 0xada72ccc20054ae9,0xaf561aa79a10ae6a, - 0xd910f7ff28069da4,0x1b2ba1518094da04, - 0x87aa9aff79042286,0x90fb44d2f05d0842, - 0xa99541bf57452b28,0x353a1607ac744a53, - 0xd3fa922f2d1675f2,0x42889b8997915ce8, - 0x847c9b5d7c2e09b7,0x69956135febada11, - 0xa59bc234db398c25,0x43fab9837e699095, - 0xcf02b2c21207ef2e,0x94f967e45e03f4bb, - 0x8161afb94b44f57d,0x1d1be0eebac278f5, - 0xa1ba1ba79e1632dc,0x6462d92a69731732, - 0xca28a291859bbf93,0x7d7b8f7503cfdcfe, - 0xfcb2cb35e702af78,0x5cda735244c3d43e, - 0x9defbf01b061adab,0x3a0888136afa64a7, - 0xc56baec21c7a1916,0x88aaa1845b8fdd0, - 0xf6c69a72a3989f5b,0x8aad549e57273d45, - 0x9a3c2087a63f6399,0x36ac54e2f678864b, - 0xc0cb28a98fcf3c7f,0x84576a1bb416a7dd, - 0xf0fdf2d3f3c30b9f,0x656d44a2a11c51d5, - 
0x969eb7c47859e743,0x9f644ae5a4b1b325, - 0xbc4665b596706114,0x873d5d9f0dde1fee, - 0xeb57ff22fc0c7959,0xa90cb506d155a7ea, - 0x9316ff75dd87cbd8,0x9a7f12442d588f2, - 0xb7dcbf5354e9bece,0xc11ed6d538aeb2f, - 0xe5d3ef282a242e81,0x8f1668c8a86da5fa, - 0x8fa475791a569d10,0xf96e017d694487bc, - 0xb38d92d760ec4455,0x37c981dcc395a9ac, - 0xe070f78d3927556a,0x85bbe253f47b1417, - 0x8c469ab843b89562,0x93956d7478ccec8e, - 0xaf58416654a6babb,0x387ac8d1970027b2, - 0xdb2e51bfe9d0696a,0x6997b05fcc0319e, - 0x88fcf317f22241e2,0x441fece3bdf81f03, - 0xab3c2fddeeaad25a,0xd527e81cad7626c3, - 0xd60b3bd56a5586f1,0x8a71e223d8d3b074, - 0x85c7056562757456,0xf6872d5667844e49, - 0xa738c6bebb12d16c,0xb428f8ac016561db, - 0xd106f86e69d785c7,0xe13336d701beba52, - 0x82a45b450226b39c,0xecc0024661173473, - 0xa34d721642b06084,0x27f002d7f95d0190, - 0xcc20ce9bd35c78a5,0x31ec038df7b441f4, - 0xff290242c83396ce,0x7e67047175a15271, - 0x9f79a169bd203e41,0xf0062c6e984d386, - 0xc75809c42c684dd1,0x52c07b78a3e60868, - 0xf92e0c3537826145,0xa7709a56ccdf8a82, - 0x9bbcc7a142b17ccb,0x88a66076400bb691, - 0xc2abf989935ddbfe,0x6acff893d00ea435, - 0xf356f7ebf83552fe,0x583f6b8c4124d43, - 0x98165af37b2153de,0xc3727a337a8b704a, - 0xbe1bf1b059e9a8d6,0x744f18c0592e4c5c, - 0xeda2ee1c7064130c,0x1162def06f79df73, - 0x9485d4d1c63e8be7,0x8addcb5645ac2ba8, - 0xb9a74a0637ce2ee1,0x6d953e2bd7173692, - 0xe8111c87c5c1ba99,0xc8fa8db6ccdd0437, - 0x910ab1d4db9914a0,0x1d9c9892400a22a2, - 0xb54d5e4a127f59c8,0x2503beb6d00cab4b, - 0xe2a0b5dc971f303a,0x2e44ae64840fd61d, - 0x8da471a9de737e24,0x5ceaecfed289e5d2, - 0xb10d8e1456105dad,0x7425a83e872c5f47, - 0xdd50f1996b947518,0xd12f124e28f77719, - 0x8a5296ffe33cc92f,0x82bd6b70d99aaa6f, - 0xace73cbfdc0bfb7b,0x636cc64d1001550b, - 0xd8210befd30efa5a,0x3c47f7e05401aa4e, - 0x8714a775e3e95c78,0x65acfaec34810a71, - 0xa8d9d1535ce3b396,0x7f1839a741a14d0d, - 0xd31045a8341ca07c,0x1ede48111209a050, - 0x83ea2b892091e44d,0x934aed0aab460432, - 0xa4e4b66b68b65d60,0xf81da84d5617853f, - 0xce1de40642e3f4b9,0x36251260ab9d668e, - 0x80d2ae83e9ce78f3,0xc1d72b7c6b426019, - 0xa1075a24e4421730,0xb24cf65b8612f81f, - 0xc94930ae1d529cfc,0xdee033f26797b627, - 0xfb9b7cd9a4a7443c,0x169840ef017da3b1, - 0x9d412e0806e88aa5,0x8e1f289560ee864e, - 0xc491798a08a2ad4e,0xf1a6f2bab92a27e2, - 0xf5b5d7ec8acb58a2,0xae10af696774b1db, - 0x9991a6f3d6bf1765,0xacca6da1e0a8ef29, - 0xbff610b0cc6edd3f,0x17fd090a58d32af3, - 0xeff394dcff8a948e,0xddfc4b4cef07f5b0, - 0x95f83d0a1fb69cd9,0x4abdaf101564f98e, - 0xbb764c4ca7a4440f,0x9d6d1ad41abe37f1, - 0xea53df5fd18d5513,0x84c86189216dc5ed, - 0x92746b9be2f8552c,0x32fd3cf5b4e49bb4, - 0xb7118682dbb66a77,0x3fbc8c33221dc2a1, - 0xe4d5e82392a40515,0xfabaf3feaa5334a, - 0x8f05b1163ba6832d,0x29cb4d87f2a7400e, - 0xb2c71d5bca9023f8,0x743e20e9ef511012, - 0xdf78e4b2bd342cf6,0x914da9246b255416, - 0x8bab8eefb6409c1a,0x1ad089b6c2f7548e, - 0xae9672aba3d0c320,0xa184ac2473b529b1, - 0xda3c0f568cc4f3e8,0xc9e5d72d90a2741e, - 0x8865899617fb1871,0x7e2fa67c7a658892, - 0xaa7eebfb9df9de8d,0xddbb901b98feeab7, - 0xd51ea6fa85785631,0x552a74227f3ea565, - 0x8533285c936b35de,0xd53a88958f87275f, - 0xa67ff273b8460356,0x8a892abaf368f137, - 0xd01fef10a657842c,0x2d2b7569b0432d85, - 0x8213f56a67f6b29b,0x9c3b29620e29fc73, - 0xa298f2c501f45f42,0x8349f3ba91b47b8f, - 0xcb3f2f7642717713,0x241c70a936219a73, - 0xfe0efb53d30dd4d7,0xed238cd383aa0110, - 0x9ec95d1463e8a506,0xf4363804324a40aa, - 0xc67bb4597ce2ce48,0xb143c6053edcd0d5, - 0xf81aa16fdc1b81da,0xdd94b7868e94050a, - 0x9b10a4e5e9913128,0xca7cf2b4191c8326, - 0xc1d4ce1f63f57d72,0xfd1c2f611f63a3f0, - 
0xf24a01a73cf2dccf,0xbc633b39673c8cec, - 0x976e41088617ca01,0xd5be0503e085d813, - 0xbd49d14aa79dbc82,0x4b2d8644d8a74e18, - 0xec9c459d51852ba2,0xddf8e7d60ed1219e, - 0x93e1ab8252f33b45,0xcabb90e5c942b503, - 0xb8da1662e7b00a17,0x3d6a751f3b936243, - 0xe7109bfba19c0c9d,0xcc512670a783ad4, - 0x906a617d450187e2,0x27fb2b80668b24c5, - 0xb484f9dc9641e9da,0xb1f9f660802dedf6, - 0xe1a63853bbd26451,0x5e7873f8a0396973, - 0x8d07e33455637eb2,0xdb0b487b6423e1e8, - 0xb049dc016abc5e5f,0x91ce1a9a3d2cda62, - 0xdc5c5301c56b75f7,0x7641a140cc7810fb, - 0x89b9b3e11b6329ba,0xa9e904c87fcb0a9d, - 0xac2820d9623bf429,0x546345fa9fbdcd44, - 0xd732290fbacaf133,0xa97c177947ad4095, - 0x867f59a9d4bed6c0,0x49ed8eabcccc485d, - 0xa81f301449ee8c70,0x5c68f256bfff5a74, - 0xd226fc195c6a2f8c,0x73832eec6fff3111, - 0x83585d8fd9c25db7,0xc831fd53c5ff7eab, - 0xa42e74f3d032f525,0xba3e7ca8b77f5e55, - 0xcd3a1230c43fb26f,0x28ce1bd2e55f35eb, - 0x80444b5e7aa7cf85,0x7980d163cf5b81b3, - 0xa0555e361951c366,0xd7e105bcc332621f, - 0xc86ab5c39fa63440,0x8dd9472bf3fefaa7, - 0xfa856334878fc150,0xb14f98f6f0feb951, - 0x9c935e00d4b9d8d2,0x6ed1bf9a569f33d3, - 0xc3b8358109e84f07,0xa862f80ec4700c8, - 0xf4a642e14c6262c8,0xcd27bb612758c0fa, - 0x98e7e9cccfbd7dbd,0x8038d51cb897789c, - 0xbf21e44003acdd2c,0xe0470a63e6bd56c3, - 0xeeea5d5004981478,0x1858ccfce06cac74, - 0x95527a5202df0ccb,0xf37801e0c43ebc8, - 0xbaa718e68396cffd,0xd30560258f54e6ba, - 0xe950df20247c83fd,0x47c6b82ef32a2069, - 0x91d28b7416cdd27e,0x4cdc331d57fa5441, - 0xb6472e511c81471d,0xe0133fe4adf8e952, - 0xe3d8f9e563a198e5,0x58180fddd97723a6, - 0x8e679c2f5e44ff8f,0x570f09eaa7ea7648,}; + constexpr static int smallest_power_of_five = + binary_format::smallest_power_of_ten(); + constexpr static int largest_power_of_five = + binary_format::largest_power_of_ten(); + constexpr static int number_of_entries = + 2 * (largest_power_of_five - smallest_power_of_five + 1); + // Powers of five from 5^-342 all the way to 5^308 rounded toward one. 
+ constexpr static uint64_t power_of_five_128[number_of_entries] = { + 0xeef453d6923bd65a, 0x113faa2906a13b3f, + 0x9558b4661b6565f8, 0x4ac7ca59a424c507, + 0xbaaee17fa23ebf76, 0x5d79bcf00d2df649, + 0xe95a99df8ace6f53, 0xf4d82c2c107973dc, + 0x91d8a02bb6c10594, 0x79071b9b8a4be869, + 0xb64ec836a47146f9, 0x9748e2826cdee284, + 0xe3e27a444d8d98b7, 0xfd1b1b2308169b25, + 0x8e6d8c6ab0787f72, 0xfe30f0f5e50e20f7, + 0xb208ef855c969f4f, 0xbdbd2d335e51a935, + 0xde8b2b66b3bc4723, 0xad2c788035e61382, + 0x8b16fb203055ac76, 0x4c3bcb5021afcc31, + 0xaddcb9e83c6b1793, 0xdf4abe242a1bbf3d, + 0xd953e8624b85dd78, 0xd71d6dad34a2af0d, + 0x87d4713d6f33aa6b, 0x8672648c40e5ad68, + 0xa9c98d8ccb009506, 0x680efdaf511f18c2, + 0xd43bf0effdc0ba48, 0x212bd1b2566def2, + 0x84a57695fe98746d, 0x14bb630f7604b57, + 0xa5ced43b7e3e9188, 0x419ea3bd35385e2d, + 0xcf42894a5dce35ea, 0x52064cac828675b9, + 0x818995ce7aa0e1b2, 0x7343efebd1940993, + 0xa1ebfb4219491a1f, 0x1014ebe6c5f90bf8, + 0xca66fa129f9b60a6, 0xd41a26e077774ef6, + 0xfd00b897478238d0, 0x8920b098955522b4, + 0x9e20735e8cb16382, 0x55b46e5f5d5535b0, + 0xc5a890362fddbc62, 0xeb2189f734aa831d, + 0xf712b443bbd52b7b, 0xa5e9ec7501d523e4, + 0x9a6bb0aa55653b2d, 0x47b233c92125366e, + 0xc1069cd4eabe89f8, 0x999ec0bb696e840a, + 0xf148440a256e2c76, 0xc00670ea43ca250d, + 0x96cd2a865764dbca, 0x380406926a5e5728, + 0xbc807527ed3e12bc, 0xc605083704f5ecf2, + 0xeba09271e88d976b, 0xf7864a44c633682e, + 0x93445b8731587ea3, 0x7ab3ee6afbe0211d, + 0xb8157268fdae9e4c, 0x5960ea05bad82964, + 0xe61acf033d1a45df, 0x6fb92487298e33bd, + 0x8fd0c16206306bab, 0xa5d3b6d479f8e056, + 0xb3c4f1ba87bc8696, 0x8f48a4899877186c, + 0xe0b62e2929aba83c, 0x331acdabfe94de87, + 0x8c71dcd9ba0b4925, 0x9ff0c08b7f1d0b14, + 0xaf8e5410288e1b6f, 0x7ecf0ae5ee44dd9, + 0xdb71e91432b1a24a, 0xc9e82cd9f69d6150, + 0x892731ac9faf056e, 0xbe311c083a225cd2, + 0xab70fe17c79ac6ca, 0x6dbd630a48aaf406, + 0xd64d3d9db981787d, 0x92cbbccdad5b108, + 0x85f0468293f0eb4e, 0x25bbf56008c58ea5, + 0xa76c582338ed2621, 0xaf2af2b80af6f24e, + 0xd1476e2c07286faa, 0x1af5af660db4aee1, + 0x82cca4db847945ca, 0x50d98d9fc890ed4d, + 0xa37fce126597973c, 0xe50ff107bab528a0, + 0xcc5fc196fefd7d0c, 0x1e53ed49a96272c8, + 0xff77b1fcbebcdc4f, 0x25e8e89c13bb0f7a, + 0x9faacf3df73609b1, 0x77b191618c54e9ac, + 0xc795830d75038c1d, 0xd59df5b9ef6a2417, + 0xf97ae3d0d2446f25, 0x4b0573286b44ad1d, + 0x9becce62836ac577, 0x4ee367f9430aec32, + 0xc2e801fb244576d5, 0x229c41f793cda73f, + 0xf3a20279ed56d48a, 0x6b43527578c1110f, + 0x9845418c345644d6, 0x830a13896b78aaa9, + 0xbe5691ef416bd60c, 0x23cc986bc656d553, + 0xedec366b11c6cb8f, 0x2cbfbe86b7ec8aa8, + 0x94b3a202eb1c3f39, 0x7bf7d71432f3d6a9, + 0xb9e08a83a5e34f07, 0xdaf5ccd93fb0cc53, + 0xe858ad248f5c22c9, 0xd1b3400f8f9cff68, + 0x91376c36d99995be, 0x23100809b9c21fa1, + 0xb58547448ffffb2d, 0xabd40a0c2832a78a, + 0xe2e69915b3fff9f9, 0x16c90c8f323f516c, + 0x8dd01fad907ffc3b, 0xae3da7d97f6792e3, + 0xb1442798f49ffb4a, 0x99cd11cfdf41779c, + 0xdd95317f31c7fa1d, 0x40405643d711d583, + 0x8a7d3eef7f1cfc52, 0x482835ea666b2572, + 0xad1c8eab5ee43b66, 0xda3243650005eecf, + 0xd863b256369d4a40, 0x90bed43e40076a82, + 0x873e4f75e2224e68, 0x5a7744a6e804a291, + 0xa90de3535aaae202, 0x711515d0a205cb36, + 0xd3515c2831559a83, 0xd5a5b44ca873e03, + 0x8412d9991ed58091, 0xe858790afe9486c2, + 0xa5178fff668ae0b6, 0x626e974dbe39a872, + 0xce5d73ff402d98e3, 0xfb0a3d212dc8128f, + 0x80fa687f881c7f8e, 0x7ce66634bc9d0b99, + 0xa139029f6a239f72, 0x1c1fffc1ebc44e80, + 0xc987434744ac874e, 0xa327ffb266b56220, + 0xfbe9141915d7a922, 0x4bf1ff9f0062baa8, + 0x9d71ac8fada6c9b5, 0x6f773fc3603db4a9, + 
0xc4ce17b399107c22, 0xcb550fb4384d21d3, + 0xf6019da07f549b2b, 0x7e2a53a146606a48, + 0x99c102844f94e0fb, 0x2eda7444cbfc426d, + 0xc0314325637a1939, 0xfa911155fefb5308, + 0xf03d93eebc589f88, 0x793555ab7eba27ca, + 0x96267c7535b763b5, 0x4bc1558b2f3458de, + 0xbbb01b9283253ca2, 0x9eb1aaedfb016f16, + 0xea9c227723ee8bcb, 0x465e15a979c1cadc, + 0x92a1958a7675175f, 0xbfacd89ec191ec9, + 0xb749faed14125d36, 0xcef980ec671f667b, + 0xe51c79a85916f484, 0x82b7e12780e7401a, + 0x8f31cc0937ae58d2, 0xd1b2ecb8b0908810, + 0xb2fe3f0b8599ef07, 0x861fa7e6dcb4aa15, + 0xdfbdcece67006ac9, 0x67a791e093e1d49a, + 0x8bd6a141006042bd, 0xe0c8bb2c5c6d24e0, + 0xaecc49914078536d, 0x58fae9f773886e18, + 0xda7f5bf590966848, 0xaf39a475506a899e, + 0x888f99797a5e012d, 0x6d8406c952429603, + 0xaab37fd7d8f58178, 0xc8e5087ba6d33b83, + 0xd5605fcdcf32e1d6, 0xfb1e4a9a90880a64, + 0x855c3be0a17fcd26, 0x5cf2eea09a55067f, + 0xa6b34ad8c9dfc06f, 0xf42faa48c0ea481e, + 0xd0601d8efc57b08b, 0xf13b94daf124da26, + 0x823c12795db6ce57, 0x76c53d08d6b70858, + 0xa2cb1717b52481ed, 0x54768c4b0c64ca6e, + 0xcb7ddcdda26da268, 0xa9942f5dcf7dfd09, + 0xfe5d54150b090b02, 0xd3f93b35435d7c4c, + 0x9efa548d26e5a6e1, 0xc47bc5014a1a6daf, + 0xc6b8e9b0709f109a, 0x359ab6419ca1091b, + 0xf867241c8cc6d4c0, 0xc30163d203c94b62, + 0x9b407691d7fc44f8, 0x79e0de63425dcf1d, + 0xc21094364dfb5636, 0x985915fc12f542e4, + 0xf294b943e17a2bc4, 0x3e6f5b7b17b2939d, + 0x979cf3ca6cec5b5a, 0xa705992ceecf9c42, + 0xbd8430bd08277231, 0x50c6ff782a838353, + 0xece53cec4a314ebd, 0xa4f8bf5635246428, + 0x940f4613ae5ed136, 0x871b7795e136be99, + 0xb913179899f68584, 0x28e2557b59846e3f, + 0xe757dd7ec07426e5, 0x331aeada2fe589cf, + 0x9096ea6f3848984f, 0x3ff0d2c85def7621, + 0xb4bca50b065abe63, 0xfed077a756b53a9, + 0xe1ebce4dc7f16dfb, 0xd3e8495912c62894, + 0x8d3360f09cf6e4bd, 0x64712dd7abbbd95c, + 0xb080392cc4349dec, 0xbd8d794d96aacfb3, + 0xdca04777f541c567, 0xecf0d7a0fc5583a0, + 0x89e42caaf9491b60, 0xf41686c49db57244, + 0xac5d37d5b79b6239, 0x311c2875c522ced5, + 0xd77485cb25823ac7, 0x7d633293366b828b, + 0x86a8d39ef77164bc, 0xae5dff9c02033197, + 0xa8530886b54dbdeb, 0xd9f57f830283fdfc, + 0xd267caa862a12d66, 0xd072df63c324fd7b, + 0x8380dea93da4bc60, 0x4247cb9e59f71e6d, + 0xa46116538d0deb78, 0x52d9be85f074e608, + 0xcd795be870516656, 0x67902e276c921f8b, + 0x806bd9714632dff6, 0xba1cd8a3db53b6, + 0xa086cfcd97bf97f3, 0x80e8a40eccd228a4, + 0xc8a883c0fdaf7df0, 0x6122cd128006b2cd, + 0xfad2a4b13d1b5d6c, 0x796b805720085f81, + 0x9cc3a6eec6311a63, 0xcbe3303674053bb0, + 0xc3f490aa77bd60fc, 0xbedbfc4411068a9c, + 0xf4f1b4d515acb93b, 0xee92fb5515482d44, + 0x991711052d8bf3c5, 0x751bdd152d4d1c4a, + 0xbf5cd54678eef0b6, 0xd262d45a78a0635d, + 0xef340a98172aace4, 0x86fb897116c87c34, + 0x9580869f0e7aac0e, 0xd45d35e6ae3d4da0, + 0xbae0a846d2195712, 0x8974836059cca109, + 0xe998d258869facd7, 0x2bd1a438703fc94b, + 0x91ff83775423cc06, 0x7b6306a34627ddcf, + 0xb67f6455292cbf08, 0x1a3bc84c17b1d542, + 0xe41f3d6a7377eeca, 0x20caba5f1d9e4a93, + 0x8e938662882af53e, 0x547eb47b7282ee9c, + 0xb23867fb2a35b28d, 0xe99e619a4f23aa43, + 0xdec681f9f4c31f31, 0x6405fa00e2ec94d4, + 0x8b3c113c38f9f37e, 0xde83bc408dd3dd04, + 0xae0b158b4738705e, 0x9624ab50b148d445, + 0xd98ddaee19068c76, 0x3badd624dd9b0957, + 0x87f8a8d4cfa417c9, 0xe54ca5d70a80e5d6, + 0xa9f6d30a038d1dbc, 0x5e9fcf4ccd211f4c, + 0xd47487cc8470652b, 0x7647c3200069671f, + 0x84c8d4dfd2c63f3b, 0x29ecd9f40041e073, + 0xa5fb0a17c777cf09, 0xf468107100525890, + 0xcf79cc9db955c2cc, 0x7182148d4066eeb4, + 0x81ac1fe293d599bf, 0xc6f14cd848405530, + 0xa21727db38cb002f, 0xb8ada00e5a506a7c, + 0xca9cf1d206fdc03b, 
0xa6d90811f0e4851c, + 0xfd442e4688bd304a, 0x908f4a166d1da663, + 0x9e4a9cec15763e2e, 0x9a598e4e043287fe, + 0xc5dd44271ad3cdba, 0x40eff1e1853f29fd, + 0xf7549530e188c128, 0xd12bee59e68ef47c, + 0x9a94dd3e8cf578b9, 0x82bb74f8301958ce, + 0xc13a148e3032d6e7, 0xe36a52363c1faf01, + 0xf18899b1bc3f8ca1, 0xdc44e6c3cb279ac1, + 0x96f5600f15a7b7e5, 0x29ab103a5ef8c0b9, + 0xbcb2b812db11a5de, 0x7415d448f6b6f0e7, + 0xebdf661791d60f56, 0x111b495b3464ad21, + 0x936b9fcebb25c995, 0xcab10dd900beec34, + 0xb84687c269ef3bfb, 0x3d5d514f40eea742, + 0xe65829b3046b0afa, 0xcb4a5a3112a5112, + 0x8ff71a0fe2c2e6dc, 0x47f0e785eaba72ab, + 0xb3f4e093db73a093, 0x59ed216765690f56, + 0xe0f218b8d25088b8, 0x306869c13ec3532c, + 0x8c974f7383725573, 0x1e414218c73a13fb, + 0xafbd2350644eeacf, 0xe5d1929ef90898fa, + 0xdbac6c247d62a583, 0xdf45f746b74abf39, + 0x894bc396ce5da772, 0x6b8bba8c328eb783, + 0xab9eb47c81f5114f, 0x66ea92f3f326564, + 0xd686619ba27255a2, 0xc80a537b0efefebd, + 0x8613fd0145877585, 0xbd06742ce95f5f36, + 0xa798fc4196e952e7, 0x2c48113823b73704, + 0xd17f3b51fca3a7a0, 0xf75a15862ca504c5, + 0x82ef85133de648c4, 0x9a984d73dbe722fb, + 0xa3ab66580d5fdaf5, 0xc13e60d0d2e0ebba, + 0xcc963fee10b7d1b3, 0x318df905079926a8, + 0xffbbcfe994e5c61f, 0xfdf17746497f7052, + 0x9fd561f1fd0f9bd3, 0xfeb6ea8bedefa633, + 0xc7caba6e7c5382c8, 0xfe64a52ee96b8fc0, + 0xf9bd690a1b68637b, 0x3dfdce7aa3c673b0, + 0x9c1661a651213e2d, 0x6bea10ca65c084e, + 0xc31bfa0fe5698db8, 0x486e494fcff30a62, + 0xf3e2f893dec3f126, 0x5a89dba3c3efccfa, + 0x986ddb5c6b3a76b7, 0xf89629465a75e01c, + 0xbe89523386091465, 0xf6bbb397f1135823, + 0xee2ba6c0678b597f, 0x746aa07ded582e2c, + 0x94db483840b717ef, 0xa8c2a44eb4571cdc, + 0xba121a4650e4ddeb, 0x92f34d62616ce413, + 0xe896a0d7e51e1566, 0x77b020baf9c81d17, + 0x915e2486ef32cd60, 0xace1474dc1d122e, + 0xb5b5ada8aaff80b8, 0xd819992132456ba, + 0xe3231912d5bf60e6, 0x10e1fff697ed6c69, + 0x8df5efabc5979c8f, 0xca8d3ffa1ef463c1, + 0xb1736b96b6fd83b3, 0xbd308ff8a6b17cb2, + 0xddd0467c64bce4a0, 0xac7cb3f6d05ddbde, + 0x8aa22c0dbef60ee4, 0x6bcdf07a423aa96b, + 0xad4ab7112eb3929d, 0x86c16c98d2c953c6, + 0xd89d64d57a607744, 0xe871c7bf077ba8b7, + 0x87625f056c7c4a8b, 0x11471cd764ad4972, + 0xa93af6c6c79b5d2d, 0xd598e40d3dd89bcf, + 0xd389b47879823479, 0x4aff1d108d4ec2c3, + 0x843610cb4bf160cb, 0xcedf722a585139ba, + 0xa54394fe1eedb8fe, 0xc2974eb4ee658828, + 0xce947a3da6a9273e, 0x733d226229feea32, + 0x811ccc668829b887, 0x806357d5a3f525f, + 0xa163ff802a3426a8, 0xca07c2dcb0cf26f7, + 0xc9bcff6034c13052, 0xfc89b393dd02f0b5, + 0xfc2c3f3841f17c67, 0xbbac2078d443ace2, + 0x9d9ba7832936edc0, 0xd54b944b84aa4c0d, + 0xc5029163f384a931, 0xa9e795e65d4df11, + 0xf64335bcf065d37d, 0x4d4617b5ff4a16d5, + 0x99ea0196163fa42e, 0x504bced1bf8e4e45, + 0xc06481fb9bcf8d39, 0xe45ec2862f71e1d6, + 0xf07da27a82c37088, 0x5d767327bb4e5a4c, + 0x964e858c91ba2655, 0x3a6a07f8d510f86f, + 0xbbe226efb628afea, 0x890489f70a55368b, + 0xeadab0aba3b2dbe5, 0x2b45ac74ccea842e, + 0x92c8ae6b464fc96f, 0x3b0b8bc90012929d, + 0xb77ada0617e3bbcb, 0x9ce6ebb40173744, + 0xe55990879ddcaabd, 0xcc420a6a101d0515, + 0x8f57fa54c2a9eab6, 0x9fa946824a12232d, + 0xb32df8e9f3546564, 0x47939822dc96abf9, + 0xdff9772470297ebd, 0x59787e2b93bc56f7, + 0x8bfbea76c619ef36, 0x57eb4edb3c55b65a, + 0xaefae51477a06b03, 0xede622920b6b23f1, + 0xdab99e59958885c4, 0xe95fab368e45eced, + 0x88b402f7fd75539b, 0x11dbcb0218ebb414, + 0xaae103b5fcd2a881, 0xd652bdc29f26a119, + 0xd59944a37c0752a2, 0x4be76d3346f0495f, + 0x857fcae62d8493a5, 0x6f70a4400c562ddb, + 0xa6dfbd9fb8e5b88e, 0xcb4ccd500f6bb952, + 0xd097ad07a71f26b2, 0x7e2000a41346a7a7, + 
0x825ecc24c873782f, 0x8ed400668c0c28c8, + 0xa2f67f2dfa90563b, 0x728900802f0f32fa, + 0xcbb41ef979346bca, 0x4f2b40a03ad2ffb9, + 0xfea126b7d78186bc, 0xe2f610c84987bfa8, + 0x9f24b832e6b0f436, 0xdd9ca7d2df4d7c9, + 0xc6ede63fa05d3143, 0x91503d1c79720dbb, + 0xf8a95fcf88747d94, 0x75a44c6397ce912a, + 0x9b69dbe1b548ce7c, 0xc986afbe3ee11aba, + 0xc24452da229b021b, 0xfbe85badce996168, + 0xf2d56790ab41c2a2, 0xfae27299423fb9c3, + 0x97c560ba6b0919a5, 0xdccd879fc967d41a, + 0xbdb6b8e905cb600f, 0x5400e987bbc1c920, + 0xed246723473e3813, 0x290123e9aab23b68, + 0x9436c0760c86e30b, 0xf9a0b6720aaf6521, + 0xb94470938fa89bce, 0xf808e40e8d5b3e69, + 0xe7958cb87392c2c2, 0xb60b1d1230b20e04, + 0x90bd77f3483bb9b9, 0xb1c6f22b5e6f48c2, + 0xb4ecd5f01a4aa828, 0x1e38aeb6360b1af3, + 0xe2280b6c20dd5232, 0x25c6da63c38de1b0, + 0x8d590723948a535f, 0x579c487e5a38ad0e, + 0xb0af48ec79ace837, 0x2d835a9df0c6d851, + 0xdcdb1b2798182244, 0xf8e431456cf88e65, + 0x8a08f0f8bf0f156b, 0x1b8e9ecb641b58ff, + 0xac8b2d36eed2dac5, 0xe272467e3d222f3f, + 0xd7adf884aa879177, 0x5b0ed81dcc6abb0f, + 0x86ccbb52ea94baea, 0x98e947129fc2b4e9, + 0xa87fea27a539e9a5, 0x3f2398d747b36224, + 0xd29fe4b18e88640e, 0x8eec7f0d19a03aad, + 0x83a3eeeef9153e89, 0x1953cf68300424ac, + 0xa48ceaaab75a8e2b, 0x5fa8c3423c052dd7, + 0xcdb02555653131b6, 0x3792f412cb06794d, + 0x808e17555f3ebf11, 0xe2bbd88bbee40bd0, + 0xa0b19d2ab70e6ed6, 0x5b6aceaeae9d0ec4, + 0xc8de047564d20a8b, 0xf245825a5a445275, + 0xfb158592be068d2e, 0xeed6e2f0f0d56712, + 0x9ced737bb6c4183d, 0x55464dd69685606b, + 0xc428d05aa4751e4c, 0xaa97e14c3c26b886, + 0xf53304714d9265df, 0xd53dd99f4b3066a8, + 0x993fe2c6d07b7fab, 0xe546a8038efe4029, + 0xbf8fdb78849a5f96, 0xde98520472bdd033, + 0xef73d256a5c0f77c, 0x963e66858f6d4440, + 0x95a8637627989aad, 0xdde7001379a44aa8, + 0xbb127c53b17ec159, 0x5560c018580d5d52, + 0xe9d71b689dde71af, 0xaab8f01e6e10b4a6, + 0x9226712162ab070d, 0xcab3961304ca70e8, + 0xb6b00d69bb55c8d1, 0x3d607b97c5fd0d22, + 0xe45c10c42a2b3b05, 0x8cb89a7db77c506a, + 0x8eb98a7a9a5b04e3, 0x77f3608e92adb242, + 0xb267ed1940f1c61c, 0x55f038b237591ed3, + 0xdf01e85f912e37a3, 0x6b6c46dec52f6688, + 0x8b61313bbabce2c6, 0x2323ac4b3b3da015, + 0xae397d8aa96c1b77, 0xabec975e0a0d081a, + 0xd9c7dced53c72255, 0x96e7bd358c904a21, + 0x881cea14545c7575, 0x7e50d64177da2e54, + 0xaa242499697392d2, 0xdde50bd1d5d0b9e9, + 0xd4ad2dbfc3d07787, 0x955e4ec64b44e864, + 0x84ec3c97da624ab4, 0xbd5af13bef0b113e, + 0xa6274bbdd0fadd61, 0xecb1ad8aeacdd58e, + 0xcfb11ead453994ba, 0x67de18eda5814af2, + 0x81ceb32c4b43fcf4, 0x80eacf948770ced7, + 0xa2425ff75e14fc31, 0xa1258379a94d028d, + 0xcad2f7f5359a3b3e, 0x96ee45813a04330, + 0xfd87b5f28300ca0d, 0x8bca9d6e188853fc, + 0x9e74d1b791e07e48, 0x775ea264cf55347e, + 0xc612062576589dda, 0x95364afe032a819e, + 0xf79687aed3eec551, 0x3a83ddbd83f52205, + 0x9abe14cd44753b52, 0xc4926a9672793543, + 0xc16d9a0095928a27, 0x75b7053c0f178294, + 0xf1c90080baf72cb1, 0x5324c68b12dd6339, + 0x971da05074da7bee, 0xd3f6fc16ebca5e04, + 0xbce5086492111aea, 0x88f4bb1ca6bcf585, + 0xec1e4a7db69561a5, 0x2b31e9e3d06c32e6, + 0x9392ee8e921d5d07, 0x3aff322e62439fd0, + 0xb877aa3236a4b449, 0x9befeb9fad487c3, + 0xe69594bec44de15b, 0x4c2ebe687989a9b4, + 0x901d7cf73ab0acd9, 0xf9d37014bf60a11, + 0xb424dc35095cd80f, 0x538484c19ef38c95, + 0xe12e13424bb40e13, 0x2865a5f206b06fba, + 0x8cbccc096f5088cb, 0xf93f87b7442e45d4, + 0xafebff0bcb24aafe, 0xf78f69a51539d749, + 0xdbe6fecebdedd5be, 0xb573440e5a884d1c, + 0x89705f4136b4a597, 0x31680a88f8953031, + 0xabcc77118461cefc, 0xfdc20d2b36ba7c3e, + 0xd6bf94d5e57a42bc, 0x3d32907604691b4d, + 0x8637bd05af6c69b5, 
0xa63f9a49c2c1b110, + 0xa7c5ac471b478423, 0xfcf80dc33721d54, + 0xd1b71758e219652b, 0xd3c36113404ea4a9, + 0x83126e978d4fdf3b, 0x645a1cac083126ea, + 0xa3d70a3d70a3d70a, 0x3d70a3d70a3d70a4, + 0xcccccccccccccccc, 0xcccccccccccccccd, + 0x8000000000000000, 0x0, + 0xa000000000000000, 0x0, + 0xc800000000000000, 0x0, + 0xfa00000000000000, 0x0, + 0x9c40000000000000, 0x0, + 0xc350000000000000, 0x0, + 0xf424000000000000, 0x0, + 0x9896800000000000, 0x0, + 0xbebc200000000000, 0x0, + 0xee6b280000000000, 0x0, + 0x9502f90000000000, 0x0, + 0xba43b74000000000, 0x0, + 0xe8d4a51000000000, 0x0, + 0x9184e72a00000000, 0x0, + 0xb5e620f480000000, 0x0, + 0xe35fa931a0000000, 0x0, + 0x8e1bc9bf04000000, 0x0, + 0xb1a2bc2ec5000000, 0x0, + 0xde0b6b3a76400000, 0x0, + 0x8ac7230489e80000, 0x0, + 0xad78ebc5ac620000, 0x0, + 0xd8d726b7177a8000, 0x0, + 0x878678326eac9000, 0x0, + 0xa968163f0a57b400, 0x0, + 0xd3c21bcecceda100, 0x0, + 0x84595161401484a0, 0x0, + 0xa56fa5b99019a5c8, 0x0, + 0xcecb8f27f4200f3a, 0x0, + 0x813f3978f8940984, 0x4000000000000000, + 0xa18f07d736b90be5, 0x5000000000000000, + 0xc9f2c9cd04674ede, 0xa400000000000000, + 0xfc6f7c4045812296, 0x4d00000000000000, + 0x9dc5ada82b70b59d, 0xf020000000000000, + 0xc5371912364ce305, 0x6c28000000000000, + 0xf684df56c3e01bc6, 0xc732000000000000, + 0x9a130b963a6c115c, 0x3c7f400000000000, + 0xc097ce7bc90715b3, 0x4b9f100000000000, + 0xf0bdc21abb48db20, 0x1e86d40000000000, + 0x96769950b50d88f4, 0x1314448000000000, + 0xbc143fa4e250eb31, 0x17d955a000000000, + 0xeb194f8e1ae525fd, 0x5dcfab0800000000, + 0x92efd1b8d0cf37be, 0x5aa1cae500000000, + 0xb7abc627050305ad, 0xf14a3d9e40000000, + 0xe596b7b0c643c719, 0x6d9ccd05d0000000, + 0x8f7e32ce7bea5c6f, 0xe4820023a2000000, + 0xb35dbf821ae4f38b, 0xdda2802c8a800000, + 0xe0352f62a19e306e, 0xd50b2037ad200000, + 0x8c213d9da502de45, 0x4526f422cc340000, + 0xaf298d050e4395d6, 0x9670b12b7f410000, + 0xdaf3f04651d47b4c, 0x3c0cdd765f114000, + 0x88d8762bf324cd0f, 0xa5880a69fb6ac800, + 0xab0e93b6efee0053, 0x8eea0d047a457a00, + 0xd5d238a4abe98068, 0x72a4904598d6d880, + 0x85a36366eb71f041, 0x47a6da2b7f864750, + 0xa70c3c40a64e6c51, 0x999090b65f67d924, + 0xd0cf4b50cfe20765, 0xfff4b4e3f741cf6d, + 0x82818f1281ed449f, 0xbff8f10e7a8921a4, + 0xa321f2d7226895c7, 0xaff72d52192b6a0d, + 0xcbea6f8ceb02bb39, 0x9bf4f8a69f764490, + 0xfee50b7025c36a08, 0x2f236d04753d5b4, + 0x9f4f2726179a2245, 0x1d762422c946590, + 0xc722f0ef9d80aad6, 0x424d3ad2b7b97ef5, + 0xf8ebad2b84e0d58b, 0xd2e0898765a7deb2, + 0x9b934c3b330c8577, 0x63cc55f49f88eb2f, + 0xc2781f49ffcfa6d5, 0x3cbf6b71c76b25fb, + 0xf316271c7fc3908a, 0x8bef464e3945ef7a, + 0x97edd871cfda3a56, 0x97758bf0e3cbb5ac, + 0xbde94e8e43d0c8ec, 0x3d52eeed1cbea317, + 0xed63a231d4c4fb27, 0x4ca7aaa863ee4bdd, + 0x945e455f24fb1cf8, 0x8fe8caa93e74ef6a, + 0xb975d6b6ee39e436, 0xb3e2fd538e122b44, + 0xe7d34c64a9c85d44, 0x60dbbca87196b616, + 0x90e40fbeea1d3a4a, 0xbc8955e946fe31cd, + 0xb51d13aea4a488dd, 0x6babab6398bdbe41, + 0xe264589a4dcdab14, 0xc696963c7eed2dd1, + 0x8d7eb76070a08aec, 0xfc1e1de5cf543ca2, + 0xb0de65388cc8ada8, 0x3b25a55f43294bcb, + 0xdd15fe86affad912, 0x49ef0eb713f39ebe, + 0x8a2dbf142dfcc7ab, 0x6e3569326c784337, + 0xacb92ed9397bf996, 0x49c2c37f07965404, + 0xd7e77a8f87daf7fb, 0xdc33745ec97be906, + 0x86f0ac99b4e8dafd, 0x69a028bb3ded71a3, + 0xa8acd7c0222311bc, 0xc40832ea0d68ce0c, + 0xd2d80db02aabd62b, 0xf50a3fa490c30190, + 0x83c7088e1aab65db, 0x792667c6da79e0fa, + 0xa4b8cab1a1563f52, 0x577001b891185938, + 0xcde6fd5e09abcf26, 0xed4c0226b55e6f86, + 0x80b05e5ac60b6178, 0x544f8158315b05b4, + 0xa0dc75f1778e39d6, 0x696361ae3db1c721, + 
0xc913936dd571c84c, 0x3bc3a19cd1e38e9, + 0xfb5878494ace3a5f, 0x4ab48a04065c723, + 0x9d174b2dcec0e47b, 0x62eb0d64283f9c76, + 0xc45d1df942711d9a, 0x3ba5d0bd324f8394, + 0xf5746577930d6500, 0xca8f44ec7ee36479, + 0x9968bf6abbe85f20, 0x7e998b13cf4e1ecb, + 0xbfc2ef456ae276e8, 0x9e3fedd8c321a67e, + 0xefb3ab16c59b14a2, 0xc5cfe94ef3ea101e, + 0x95d04aee3b80ece5, 0xbba1f1d158724a12, + 0xbb445da9ca61281f, 0x2a8a6e45ae8edc97, + 0xea1575143cf97226, 0xf52d09d71a3293bd, + 0x924d692ca61be758, 0x593c2626705f9c56, + 0xb6e0c377cfa2e12e, 0x6f8b2fb00c77836c, + 0xe498f455c38b997a, 0xb6dfb9c0f956447, + 0x8edf98b59a373fec, 0x4724bd4189bd5eac, + 0xb2977ee300c50fe7, 0x58edec91ec2cb657, + 0xdf3d5e9bc0f653e1, 0x2f2967b66737e3ed, + 0x8b865b215899f46c, 0xbd79e0d20082ee74, + 0xae67f1e9aec07187, 0xecd8590680a3aa11, + 0xda01ee641a708de9, 0xe80e6f4820cc9495, + 0x884134fe908658b2, 0x3109058d147fdcdd, + 0xaa51823e34a7eede, 0xbd4b46f0599fd415, + 0xd4e5e2cdc1d1ea96, 0x6c9e18ac7007c91a, + 0x850fadc09923329e, 0x3e2cf6bc604ddb0, + 0xa6539930bf6bff45, 0x84db8346b786151c, + 0xcfe87f7cef46ff16, 0xe612641865679a63, + 0x81f14fae158c5f6e, 0x4fcb7e8f3f60c07e, + 0xa26da3999aef7749, 0xe3be5e330f38f09d, + 0xcb090c8001ab551c, 0x5cadf5bfd3072cc5, + 0xfdcb4fa002162a63, 0x73d9732fc7c8f7f6, + 0x9e9f11c4014dda7e, 0x2867e7fddcdd9afa, + 0xc646d63501a1511d, 0xb281e1fd541501b8, + 0xf7d88bc24209a565, 0x1f225a7ca91a4226, + 0x9ae757596946075f, 0x3375788de9b06958, + 0xc1a12d2fc3978937, 0x52d6b1641c83ae, + 0xf209787bb47d6b84, 0xc0678c5dbd23a49a, + 0x9745eb4d50ce6332, 0xf840b7ba963646e0, + 0xbd176620a501fbff, 0xb650e5a93bc3d898, + 0xec5d3fa8ce427aff, 0xa3e51f138ab4cebe, + 0x93ba47c980e98cdf, 0xc66f336c36b10137, + 0xb8a8d9bbe123f017, 0xb80b0047445d4184, + 0xe6d3102ad96cec1d, 0xa60dc059157491e5, + 0x9043ea1ac7e41392, 0x87c89837ad68db2f, + 0xb454e4a179dd1877, 0x29babe4598c311fb, + 0xe16a1dc9d8545e94, 0xf4296dd6fef3d67a, + 0x8ce2529e2734bb1d, 0x1899e4a65f58660c, + 0xb01ae745b101e9e4, 0x5ec05dcff72e7f8f, + 0xdc21a1171d42645d, 0x76707543f4fa1f73, + 0x899504ae72497eba, 0x6a06494a791c53a8, + 0xabfa45da0edbde69, 0x487db9d17636892, + 0xd6f8d7509292d603, 0x45a9d2845d3c42b6, + 0x865b86925b9bc5c2, 0xb8a2392ba45a9b2, + 0xa7f26836f282b732, 0x8e6cac7768d7141e, + 0xd1ef0244af2364ff, 0x3207d795430cd926, + 0x8335616aed761f1f, 0x7f44e6bd49e807b8, + 0xa402b9c5a8d3a6e7, 0x5f16206c9c6209a6, + 0xcd036837130890a1, 0x36dba887c37a8c0f, + 0x802221226be55a64, 0xc2494954da2c9789, + 0xa02aa96b06deb0fd, 0xf2db9baa10b7bd6c, + 0xc83553c5c8965d3d, 0x6f92829494e5acc7, + 0xfa42a8b73abbf48c, 0xcb772339ba1f17f9, + 0x9c69a97284b578d7, 0xff2a760414536efb, + 0xc38413cf25e2d70d, 0xfef5138519684aba, + 0xf46518c2ef5b8cd1, 0x7eb258665fc25d69, + 0x98bf2f79d5993802, 0xef2f773ffbd97a61, + 0xbeeefb584aff8603, 0xaafb550ffacfd8fa, + 0xeeaaba2e5dbf6784, 0x95ba2a53f983cf38, + 0x952ab45cfa97a0b2, 0xdd945a747bf26183, + 0xba756174393d88df, 0x94f971119aeef9e4, + 0xe912b9d1478ceb17, 0x7a37cd5601aab85d, + 0x91abb422ccb812ee, 0xac62e055c10ab33a, + 0xb616a12b7fe617aa, 0x577b986b314d6009, + 0xe39c49765fdf9d94, 0xed5a7e85fda0b80b, + 0x8e41ade9fbebc27d, 0x14588f13be847307, + 0xb1d219647ae6b31c, 0x596eb2d8ae258fc8, + 0xde469fbd99a05fe3, 0x6fca5f8ed9aef3bb, + 0x8aec23d680043bee, 0x25de7bb9480d5854, + 0xada72ccc20054ae9, 0xaf561aa79a10ae6a, + 0xd910f7ff28069da4, 0x1b2ba1518094da04, + 0x87aa9aff79042286, 0x90fb44d2f05d0842, + 0xa99541bf57452b28, 0x353a1607ac744a53, + 0xd3fa922f2d1675f2, 0x42889b8997915ce8, + 0x847c9b5d7c2e09b7, 0x69956135febada11, + 0xa59bc234db398c25, 0x43fab9837e699095, + 0xcf02b2c21207ef2e, 
0x94f967e45e03f4bb, + 0x8161afb94b44f57d, 0x1d1be0eebac278f5, + 0xa1ba1ba79e1632dc, 0x6462d92a69731732, + 0xca28a291859bbf93, 0x7d7b8f7503cfdcfe, + 0xfcb2cb35e702af78, 0x5cda735244c3d43e, + 0x9defbf01b061adab, 0x3a0888136afa64a7, + 0xc56baec21c7a1916, 0x88aaa1845b8fdd0, + 0xf6c69a72a3989f5b, 0x8aad549e57273d45, + 0x9a3c2087a63f6399, 0x36ac54e2f678864b, + 0xc0cb28a98fcf3c7f, 0x84576a1bb416a7dd, + 0xf0fdf2d3f3c30b9f, 0x656d44a2a11c51d5, + 0x969eb7c47859e743, 0x9f644ae5a4b1b325, + 0xbc4665b596706114, 0x873d5d9f0dde1fee, + 0xeb57ff22fc0c7959, 0xa90cb506d155a7ea, + 0x9316ff75dd87cbd8, 0x9a7f12442d588f2, + 0xb7dcbf5354e9bece, 0xc11ed6d538aeb2f, + 0xe5d3ef282a242e81, 0x8f1668c8a86da5fa, + 0x8fa475791a569d10, 0xf96e017d694487bc, + 0xb38d92d760ec4455, 0x37c981dcc395a9ac, + 0xe070f78d3927556a, 0x85bbe253f47b1417, + 0x8c469ab843b89562, 0x93956d7478ccec8e, + 0xaf58416654a6babb, 0x387ac8d1970027b2, + 0xdb2e51bfe9d0696a, 0x6997b05fcc0319e, + 0x88fcf317f22241e2, 0x441fece3bdf81f03, + 0xab3c2fddeeaad25a, 0xd527e81cad7626c3, + 0xd60b3bd56a5586f1, 0x8a71e223d8d3b074, + 0x85c7056562757456, 0xf6872d5667844e49, + 0xa738c6bebb12d16c, 0xb428f8ac016561db, + 0xd106f86e69d785c7, 0xe13336d701beba52, + 0x82a45b450226b39c, 0xecc0024661173473, + 0xa34d721642b06084, 0x27f002d7f95d0190, + 0xcc20ce9bd35c78a5, 0x31ec038df7b441f4, + 0xff290242c83396ce, 0x7e67047175a15271, + 0x9f79a169bd203e41, 0xf0062c6e984d386, + 0xc75809c42c684dd1, 0x52c07b78a3e60868, + 0xf92e0c3537826145, 0xa7709a56ccdf8a82, + 0x9bbcc7a142b17ccb, 0x88a66076400bb691, + 0xc2abf989935ddbfe, 0x6acff893d00ea435, + 0xf356f7ebf83552fe, 0x583f6b8c4124d43, + 0x98165af37b2153de, 0xc3727a337a8b704a, + 0xbe1bf1b059e9a8d6, 0x744f18c0592e4c5c, + 0xeda2ee1c7064130c, 0x1162def06f79df73, + 0x9485d4d1c63e8be7, 0x8addcb5645ac2ba8, + 0xb9a74a0637ce2ee1, 0x6d953e2bd7173692, + 0xe8111c87c5c1ba99, 0xc8fa8db6ccdd0437, + 0x910ab1d4db9914a0, 0x1d9c9892400a22a2, + 0xb54d5e4a127f59c8, 0x2503beb6d00cab4b, + 0xe2a0b5dc971f303a, 0x2e44ae64840fd61d, + 0x8da471a9de737e24, 0x5ceaecfed289e5d2, + 0xb10d8e1456105dad, 0x7425a83e872c5f47, + 0xdd50f1996b947518, 0xd12f124e28f77719, + 0x8a5296ffe33cc92f, 0x82bd6b70d99aaa6f, + 0xace73cbfdc0bfb7b, 0x636cc64d1001550b, + 0xd8210befd30efa5a, 0x3c47f7e05401aa4e, + 0x8714a775e3e95c78, 0x65acfaec34810a71, + 0xa8d9d1535ce3b396, 0x7f1839a741a14d0d, + 0xd31045a8341ca07c, 0x1ede48111209a050, + 0x83ea2b892091e44d, 0x934aed0aab460432, + 0xa4e4b66b68b65d60, 0xf81da84d5617853f, + 0xce1de40642e3f4b9, 0x36251260ab9d668e, + 0x80d2ae83e9ce78f3, 0xc1d72b7c6b426019, + 0xa1075a24e4421730, 0xb24cf65b8612f81f, + 0xc94930ae1d529cfc, 0xdee033f26797b627, + 0xfb9b7cd9a4a7443c, 0x169840ef017da3b1, + 0x9d412e0806e88aa5, 0x8e1f289560ee864e, + 0xc491798a08a2ad4e, 0xf1a6f2bab92a27e2, + 0xf5b5d7ec8acb58a2, 0xae10af696774b1db, + 0x9991a6f3d6bf1765, 0xacca6da1e0a8ef29, + 0xbff610b0cc6edd3f, 0x17fd090a58d32af3, + 0xeff394dcff8a948e, 0xddfc4b4cef07f5b0, + 0x95f83d0a1fb69cd9, 0x4abdaf101564f98e, + 0xbb764c4ca7a4440f, 0x9d6d1ad41abe37f1, + 0xea53df5fd18d5513, 0x84c86189216dc5ed, + 0x92746b9be2f8552c, 0x32fd3cf5b4e49bb4, + 0xb7118682dbb66a77, 0x3fbc8c33221dc2a1, + 0xe4d5e82392a40515, 0xfabaf3feaa5334a, + 0x8f05b1163ba6832d, 0x29cb4d87f2a7400e, + 0xb2c71d5bca9023f8, 0x743e20e9ef511012, + 0xdf78e4b2bd342cf6, 0x914da9246b255416, + 0x8bab8eefb6409c1a, 0x1ad089b6c2f7548e, + 0xae9672aba3d0c320, 0xa184ac2473b529b1, + 0xda3c0f568cc4f3e8, 0xc9e5d72d90a2741e, + 0x8865899617fb1871, 0x7e2fa67c7a658892, + 0xaa7eebfb9df9de8d, 0xddbb901b98feeab7, + 0xd51ea6fa85785631, 0x552a74227f3ea565, + 
0x8533285c936b35de, 0xd53a88958f87275f, + 0xa67ff273b8460356, 0x8a892abaf368f137, + 0xd01fef10a657842c, 0x2d2b7569b0432d85, + 0x8213f56a67f6b29b, 0x9c3b29620e29fc73, + 0xa298f2c501f45f42, 0x8349f3ba91b47b8f, + 0xcb3f2f7642717713, 0x241c70a936219a73, + 0xfe0efb53d30dd4d7, 0xed238cd383aa0110, + 0x9ec95d1463e8a506, 0xf4363804324a40aa, + 0xc67bb4597ce2ce48, 0xb143c6053edcd0d5, + 0xf81aa16fdc1b81da, 0xdd94b7868e94050a, + 0x9b10a4e5e9913128, 0xca7cf2b4191c8326, + 0xc1d4ce1f63f57d72, 0xfd1c2f611f63a3f0, + 0xf24a01a73cf2dccf, 0xbc633b39673c8cec, + 0x976e41088617ca01, 0xd5be0503e085d813, + 0xbd49d14aa79dbc82, 0x4b2d8644d8a74e18, + 0xec9c459d51852ba2, 0xddf8e7d60ed1219e, + 0x93e1ab8252f33b45, 0xcabb90e5c942b503, + 0xb8da1662e7b00a17, 0x3d6a751f3b936243, + 0xe7109bfba19c0c9d, 0xcc512670a783ad4, + 0x906a617d450187e2, 0x27fb2b80668b24c5, + 0xb484f9dc9641e9da, 0xb1f9f660802dedf6, + 0xe1a63853bbd26451, 0x5e7873f8a0396973, + 0x8d07e33455637eb2, 0xdb0b487b6423e1e8, + 0xb049dc016abc5e5f, 0x91ce1a9a3d2cda62, + 0xdc5c5301c56b75f7, 0x7641a140cc7810fb, + 0x89b9b3e11b6329ba, 0xa9e904c87fcb0a9d, + 0xac2820d9623bf429, 0x546345fa9fbdcd44, + 0xd732290fbacaf133, 0xa97c177947ad4095, + 0x867f59a9d4bed6c0, 0x49ed8eabcccc485d, + 0xa81f301449ee8c70, 0x5c68f256bfff5a74, + 0xd226fc195c6a2f8c, 0x73832eec6fff3111, + 0x83585d8fd9c25db7, 0xc831fd53c5ff7eab, + 0xa42e74f3d032f525, 0xba3e7ca8b77f5e55, + 0xcd3a1230c43fb26f, 0x28ce1bd2e55f35eb, + 0x80444b5e7aa7cf85, 0x7980d163cf5b81b3, + 0xa0555e361951c366, 0xd7e105bcc332621f, + 0xc86ab5c39fa63440, 0x8dd9472bf3fefaa7, + 0xfa856334878fc150, 0xb14f98f6f0feb951, + 0x9c935e00d4b9d8d2, 0x6ed1bf9a569f33d3, + 0xc3b8358109e84f07, 0xa862f80ec4700c8, + 0xf4a642e14c6262c8, 0xcd27bb612758c0fa, + 0x98e7e9cccfbd7dbd, 0x8038d51cb897789c, + 0xbf21e44003acdd2c, 0xe0470a63e6bd56c3, + 0xeeea5d5004981478, 0x1858ccfce06cac74, + 0x95527a5202df0ccb, 0xf37801e0c43ebc8, + 0xbaa718e68396cffd, 0xd30560258f54e6ba, + 0xe950df20247c83fd, 0x47c6b82ef32a2069, + 0x91d28b7416cdd27e, 0x4cdc331d57fa5441, + 0xb6472e511c81471d, 0xe0133fe4adf8e952, + 0xe3d8f9e563a198e5, 0x58180fddd97723a6, + 0x8e679c2f5e44ff8f, 0x570f09eaa7ea7648, + }; }; +#if FASTFLOAT_DETAIL_MUST_DEFINE_CONSTEXPR_VARIABLE + template -constexpr uint64_t powers_template::power_of_five_128[number_of_entries]; +constexpr uint64_t + powers_template::power_of_five_128[number_of_entries]; + +#endif using powers = powers_template<>; diff --git a/third_party/fast_float/float_common.h b/third_party/fast_float/float_common.h index bee882152025..edc163cb472e 100644 --- a/third_party/fast_float/float_common.h +++ b/third_party/fast_float/float_common.h @@ -7,7 +7,11 @@ #include #include #include - +#ifdef __has_include +#if __has_include() && (__cplusplus > 202002L || _MSVC_LANG > 202002L) +#include +#endif +#endif #include "constexpr_feature_detect.h" namespace fast_float { @@ -28,18 +32,16 @@ enum chars_format { general = fixed | scientific }; -template -struct from_chars_result_t { - UC const* ptr; +template struct from_chars_result_t { + UC const *ptr; std::errc ec; }; using from_chars_result = from_chars_result_t; -template -struct parse_options_t { +template struct parse_options_t { constexpr explicit parse_options_t(chars_format fmt = chars_format::general, - UC dot = UC('.')) - : format(fmt), decimal_point(dot) {} + UC dot = UC('.')) + : format(fmt), decimal_point(dot) {} /** Which number formats are accepted */ chars_format format; @@ -48,39 +50,41 @@ struct parse_options_t { }; using parse_options = parse_options_t; -} +} // namespace fast_float #if 
FASTFLOAT_HAS_BIT_CAST #include #endif -#if (defined(__x86_64) || defined(__x86_64__) || defined(_M_X64) \ - || defined(__amd64) || defined(__aarch64__) || defined(_M_ARM64) \ - || defined(__MINGW64__) \ - || defined(__s390x__) \ - || (defined(__ppc64__) || defined(__PPC64__) || defined(__ppc64le__) || defined(__PPC64LE__)) \ - || defined(__loongarch64) ) +#if (defined(__x86_64) || defined(__x86_64__) || defined(_M_X64) || \ + defined(__amd64) || defined(__aarch64__) || defined(_M_ARM64) || \ + defined(__MINGW64__) || defined(__s390x__) || \ + (defined(__ppc64__) || defined(__PPC64__) || defined(__ppc64le__) || \ + defined(__PPC64LE__)) || \ + defined(__loongarch64)) #define FASTFLOAT_64BIT 1 -#elif (defined(__i386) || defined(__i386__) || defined(_M_IX86) \ - || defined(__arm__) || defined(_M_ARM) || defined(__ppc__) \ - || defined(__MINGW32__) || defined(__EMSCRIPTEN__)) +#elif (defined(__i386) || defined(__i386__) || defined(_M_IX86) || \ + defined(__arm__) || defined(_M_ARM) || defined(__ppc__) || \ + defined(__MINGW32__) || defined(__EMSCRIPTEN__)) #define FASTFLOAT_32BIT 1 #else // Need to check incrementally, since SIZE_MAX is a size_t, avoid overflow. - // We can never tell the register width, but the SIZE_MAX is a good approximation. - // UINTPTR_MAX and INTPTR_MAX are optional, so avoid them for max portability. - #if SIZE_MAX == 0xffff - #error Unknown platform (16-bit, unsupported) - #elif SIZE_MAX == 0xffffffff - #define FASTFLOAT_32BIT 1 - #elif SIZE_MAX == 0xffffffffffffffff - #define FASTFLOAT_64BIT 1 - #else - #error Unknown platform (not 32-bit, not 64-bit?) - #endif -#endif - -#if ((defined(_WIN32) || defined(_WIN64)) && !defined(__clang__)) +// We can never tell the register width, but the SIZE_MAX is a good +// approximation. UINTPTR_MAX and INTPTR_MAX are optional, so avoid them for max +// portability. +#if SIZE_MAX == 0xffff +#error Unknown platform (16-bit, unsupported) +#elif SIZE_MAX == 0xffffffff +#define FASTFLOAT_32BIT 1 +#elif SIZE_MAX == 0xffffffffffffffff +#define FASTFLOAT_64BIT 1 +#else +#error Unknown platform (not 32-bit, not 64-bit?) 
+#endif +#endif + +#if ((defined(_WIN32) || defined(_WIN64)) && !defined(__clang__)) || \ + (defined(_M_ARM64) && !defined(__MINGW32__)) #include #endif @@ -124,9 +128,9 @@ using parse_options = parse_options_t; #endif #endif -#if defined(__SSE2__) || \ - (defined(FASTFLOAT_VISUAL_STUDIO) && \ - (defined(_M_AMD64) || defined(_M_X64) || (defined(_M_IX86_FP) && _M_IX86_FP == 2))) +#if defined(__SSE2__) || (defined(FASTFLOAT_VISUAL_STUDIO) && \ + (defined(_M_AMD64) || defined(_M_X64) || \ + (defined(_M_IX86_FP) && _M_IX86_FP == 2))) #define FASTFLOAT_SSE2 1 #endif @@ -134,28 +138,25 @@ using parse_options = parse_options_t; #define FASTFLOAT_NEON 1 #endif -#if defined(FASTFLOAT_SSE2) || defined(FASTFLOAT_ARM64) +#if defined(FASTFLOAT_SSE2) || defined(FASTFLOAT_NEON) #define FASTFLOAT_HAS_SIMD 1 #endif #if defined(__GNUC__) // disable -Wcast-align=strict (GCC only) -#define FASTFLOAT_SIMD_DISABLE_WARNINGS \ - _Pragma("GCC diagnostic push") \ - _Pragma("GCC diagnostic ignored \"-Wcast-align\"") +#define FASTFLOAT_SIMD_DISABLE_WARNINGS \ + _Pragma("GCC diagnostic push") \ + _Pragma("GCC diagnostic ignored \"-Wcast-align\"") #else #define FASTFLOAT_SIMD_DISABLE_WARNINGS #endif #if defined(__GNUC__) -#define FASTFLOAT_SIMD_RESTORE_WARNINGS \ - _Pragma("GCC diagnostic pop") +#define FASTFLOAT_SIMD_RESTORE_WARNINGS _Pragma("GCC diagnostic pop") #else #define FASTFLOAT_SIMD_RESTORE_WARNINGS #endif - - #ifdef FASTFLOAT_VISUAL_STUDIO #define fastfloat_really_inline __forceinline #else @@ -163,18 +164,24 @@ using parse_options = parse_options_t; #endif #ifndef FASTFLOAT_ASSERT -#define FASTFLOAT_ASSERT(x) { ((void)(x)); } +#define FASTFLOAT_ASSERT(x) \ + { ((void)(x)); } #endif #ifndef FASTFLOAT_DEBUG_ASSERT -#define FASTFLOAT_DEBUG_ASSERT(x) { ((void)(x)); } +#define FASTFLOAT_DEBUG_ASSERT(x) \ + { ((void)(x)); } #endif // rust style `try!()` macro, or `?` operator -#define FASTFLOAT_TRY(x) { if (!(x)) return false; } - -#define FASTFLOAT_ENABLE_IF(...) typename std::enable_if<(__VA_ARGS__), int>::type = 0 +#define FASTFLOAT_TRY(x) \ + { \ + if (!(x)) \ + return false; \ + } +#define FASTFLOAT_ENABLE_IF(...) \ + typename std::enable_if<(__VA_ARGS__), int>::type namespace fast_float { @@ -186,10 +193,28 @@ fastfloat_really_inline constexpr bool cpp20_and_in_constexpr() { #endif } +template +fastfloat_really_inline constexpr bool is_supported_float_type() { + return std::is_same::value || std::is_same::value +#if __STDCPP_FLOAT32_T__ + || std::is_same::value +#endif +#if __STDCPP_FLOAT64_T__ + || std::is_same::value +#endif + ; +} + +template +fastfloat_really_inline constexpr bool is_supported_char_type() { + return std::is_same::value || std::is_same::value || + std::is_same::value || std::is_same::value; +} + // Compares two ASCII strings in a case insensitive manner. 
template inline FASTFLOAT_CONSTEXPR14 bool -fastfloat_strncasecmp(UC const * input1, UC const * input2, size_t length) { +fastfloat_strncasecmp(UC const *input1, UC const *input2, size_t length) { char running_diff{0}; for (size_t i = 0; i < length; ++i) { running_diff |= (char(input1[i]) ^ char(input2[i])); @@ -202,18 +227,15 @@ fastfloat_strncasecmp(UC const * input1, UC const * input2, size_t length) { #endif // a pointer and a length to a contiguous block of memory -template -struct span { - const T* ptr; +template struct span { + const T *ptr; size_t length; - constexpr span(const T* _ptr, size_t _length) : ptr(_ptr), length(_length) {} + constexpr span(const T *_ptr, size_t _length) : ptr(_ptr), length(_length) {} constexpr span() : ptr(nullptr), length(0) {} - constexpr size_t len() const noexcept { - return length; - } + constexpr size_t len() const noexcept { return length; } - FASTFLOAT_CONSTEXPR14 const T& operator[](size_t index) const noexcept { + FASTFLOAT_CONSTEXPR14 const T &operator[](size_t index) const noexcept { FASTFLOAT_DEBUG_ASSERT(index < length); return ptr[index]; } @@ -227,34 +249,51 @@ struct value128 { }; /* Helper C++14 constexpr generic implementation of leading_zeroes */ -fastfloat_really_inline FASTFLOAT_CONSTEXPR14 -int leading_zeroes_generic(uint64_t input_num, int last_bit = 0) { - if(input_num & uint64_t(0xffffffff00000000)) { input_num >>= 32; last_bit |= 32; } - if(input_num & uint64_t( 0xffff0000)) { input_num >>= 16; last_bit |= 16; } - if(input_num & uint64_t( 0xff00)) { input_num >>= 8; last_bit |= 8; } - if(input_num & uint64_t( 0xf0)) { input_num >>= 4; last_bit |= 4; } - if(input_num & uint64_t( 0xc)) { input_num >>= 2; last_bit |= 2; } - if(input_num & uint64_t( 0x2)) { input_num >>= 1; last_bit |= 1; } - return 63 - last_bit; +fastfloat_really_inline FASTFLOAT_CONSTEXPR14 int +leading_zeroes_generic(uint64_t input_num, int last_bit = 0) { + if (input_num & uint64_t(0xffffffff00000000)) { + input_num >>= 32; + last_bit |= 32; + } + if (input_num & uint64_t(0xffff0000)) { + input_num >>= 16; + last_bit |= 16; + } + if (input_num & uint64_t(0xff00)) { + input_num >>= 8; + last_bit |= 8; + } + if (input_num & uint64_t(0xf0)) { + input_num >>= 4; + last_bit |= 4; + } + if (input_num & uint64_t(0xc)) { + input_num >>= 2; + last_bit |= 2; + } + if (input_num & uint64_t(0x2)) { /* input_num >>= 1; */ + last_bit |= 1; + } + return 63 - last_bit; } /* result might be undefined when input_num is zero */ -fastfloat_really_inline FASTFLOAT_CONSTEXPR20 -int leading_zeroes(uint64_t input_num) { +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 int +leading_zeroes(uint64_t input_num) { assert(input_num > 0); if (cpp20_and_in_constexpr()) { return leading_zeroes_generic(input_num); } #ifdef FASTFLOAT_VISUAL_STUDIO - #if defined(_M_X64) || defined(_M_ARM64) +#if defined(_M_X64) || defined(_M_ARM64) unsigned long leading_zero = 0; // Search the mask data from most significant bit (MSB) // to least significant bit (LSB) for a set bit (1). 
_BitScanReverse64(&leading_zero, input_num); return (int)(63 - leading_zero); - #else +#else return leading_zeroes_generic(input_num); - #endif +#endif #else return __builtin_clzll(input_num); #endif @@ -262,18 +301,18 @@ int leading_zeroes(uint64_t input_num) { // slow emulation routine for 32-bit fastfloat_really_inline constexpr uint64_t emulu(uint32_t x, uint32_t y) { - return x * (uint64_t)y; + return x * (uint64_t)y; } -fastfloat_really_inline FASTFLOAT_CONSTEXPR14 -uint64_t umul128_generic(uint64_t ab, uint64_t cd, uint64_t *hi) { +fastfloat_really_inline FASTFLOAT_CONSTEXPR14 uint64_t +umul128_generic(uint64_t ab, uint64_t cd, uint64_t *hi) { uint64_t ad = emulu((uint32_t)(ab >> 32), (uint32_t)cd); uint64_t bd = emulu((uint32_t)ab, (uint32_t)cd); uint64_t adbc = ad + emulu((uint32_t)ab, (uint32_t)(cd >> 32)); - uint64_t adbc_carry = !!(adbc < ad); + uint64_t adbc_carry = (uint64_t)(adbc < ad); uint64_t lo = bd + (adbc << 32); *hi = emulu((uint32_t)(ab >> 32), (uint32_t)(cd >> 32)) + (adbc >> 32) + - (adbc_carry << 32) + !!(lo < bd); + (adbc_carry << 32) + (uint64_t)(lo < bd); return lo; } @@ -281,18 +320,18 @@ uint64_t umul128_generic(uint64_t ab, uint64_t cd, uint64_t *hi) { // slow emulation routine for 32-bit #if !defined(__MINGW64__) -fastfloat_really_inline FASTFLOAT_CONSTEXPR14 -uint64_t _umul128(uint64_t ab, uint64_t cd, uint64_t *hi) { +fastfloat_really_inline FASTFLOAT_CONSTEXPR14 uint64_t _umul128(uint64_t ab, + uint64_t cd, + uint64_t *hi) { return umul128_generic(ab, cd, hi); } #endif // !__MINGW64__ #endif // FASTFLOAT_32BIT - // compute 64-bit a*b -fastfloat_really_inline FASTFLOAT_CONSTEXPR20 -value128 full_multiplication(uint64_t a, uint64_t b) { +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 value128 +full_multiplication(uint64_t a, uint64_t b) { if (cpp20_and_in_constexpr()) { value128 answer; answer.low = umul128_generic(a, b, &answer.high); @@ -304,9 +343,10 @@ value128 full_multiplication(uint64_t a, uint64_t b) { // But MinGW on ARM64 doesn't have native support for 64-bit multiplications answer.high = __umulh(a, b); answer.low = a * b; -#elif defined(FASTFLOAT_32BIT) || (defined(_WIN64) && !defined(__clang__)) +#elif defined(FASTFLOAT_32BIT) || \ + (defined(_WIN64) && !defined(__clang__) && !defined(_M_ARM64)) answer.low = _umul128(a, b, &answer.high); // _umul128 not available on ARM64 -#elif defined(FASTFLOAT_64BIT) +#elif defined(FASTFLOAT_64BIT) && defined(__SIZEOF_INT128__) __uint128_t r = ((__uint128_t)a) * b; answer.low = uint64_t(r); answer.high = uint64_t(r >> 64); @@ -334,22 +374,24 @@ constexpr static int32_t invalid_am_bias = -0x8000; // used for binary_format_lookup_tables::max_mantissa constexpr uint64_t constant_55555 = 5 * 5 * 5 * 5 * 5; -template -struct binary_format_lookup_tables; +template struct binary_format_lookup_tables; template struct binary_format : binary_format_lookup_tables { - using equiv_uint = typename std::conditional::type; + using equiv_uint = + typename std::conditional::type; static inline constexpr int mantissa_explicit_bits(); static inline constexpr int minimum_exponent(); static inline constexpr int infinite_power(); static inline constexpr int sign_index(); - static inline constexpr int min_exponent_fast_path(); // used when fegetround() == FE_TONEAREST + static inline constexpr int + min_exponent_fast_path(); // used when fegetround() == FE_TONEAREST static inline constexpr int max_exponent_fast_path(); static inline constexpr int max_exponent_round_to_even(); static inline constexpr int 
min_exponent_round_to_even(); static inline constexpr uint64_t max_mantissa_fast_path(int64_t power); - static inline constexpr uint64_t max_mantissa_fast_path(); // used when fegetround() == FE_TONEAREST + static inline constexpr uint64_t + max_mantissa_fast_path(); // used when fegetround() == FE_TONEAREST static inline constexpr int largest_power_of_ten(); static inline constexpr int smallest_power_of_ten(); static inline constexpr T exact_power_of_ten(int64_t power); @@ -359,76 +401,91 @@ template struct binary_format : binary_format_lookup_tables { static inline constexpr equiv_uint hidden_bit_mask(); }; -template -struct binary_format_lookup_tables { +template struct binary_format_lookup_tables { static constexpr double powers_of_ten[] = { 1e0, 1e1, 1e2, 1e3, 1e4, 1e5, 1e6, 1e7, 1e8, 1e9, 1e10, 1e11, 1e12, 1e13, 1e14, 1e15, 1e16, 1e17, 1e18, 1e19, 1e20, 1e21, 1e22}; // Largest integer value v so that (5**index * v) <= 1<<53. - // 0x10000000000000 == 1 << 53 + // 0x20000000000000 == 1 << 53 static constexpr uint64_t max_mantissa[] = { - 0x10000000000000, - 0x10000000000000 / 5, - 0x10000000000000 / (5 * 5), - 0x10000000000000 / (5 * 5 * 5), - 0x10000000000000 / (5 * 5 * 5 * 5), - 0x10000000000000 / (constant_55555), - 0x10000000000000 / (constant_55555 * 5), - 0x10000000000000 / (constant_55555 * 5 * 5), - 0x10000000000000 / (constant_55555 * 5 * 5 * 5), - 0x10000000000000 / (constant_55555 * 5 * 5 * 5 * 5), - 0x10000000000000 / (constant_55555 * constant_55555), - 0x10000000000000 / (constant_55555 * constant_55555 * 5), - 0x10000000000000 / (constant_55555 * constant_55555 * 5 * 5), - 0x10000000000000 / (constant_55555 * constant_55555 * 5 * 5 * 5), - 0x10000000000000 / (constant_55555 * constant_55555 * constant_55555), - 0x10000000000000 / (constant_55555 * constant_55555 * constant_55555 * 5), - 0x10000000000000 / (constant_55555 * constant_55555 * constant_55555 * 5 * 5), - 0x10000000000000 / (constant_55555 * constant_55555 * constant_55555 * 5 * 5 * 5), - 0x10000000000000 / (constant_55555 * constant_55555 * constant_55555 * 5 * 5 * 5 * 5), - 0x10000000000000 / (constant_55555 * constant_55555 * constant_55555 * constant_55555), - 0x10000000000000 / (constant_55555 * constant_55555 * constant_55555 * constant_55555 * 5), - 0x10000000000000 / (constant_55555 * constant_55555 * constant_55555 * constant_55555 * 5 * 5), - 0x10000000000000 / (constant_55555 * constant_55555 * constant_55555 * constant_55555 * 5 * 5 * 5), - 0x10000000000000 / (constant_55555 * constant_55555 * constant_55555 * constant_55555 * 5 * 5 * 5 * 5)}; + 0x20000000000000, + 0x20000000000000 / 5, + 0x20000000000000 / (5 * 5), + 0x20000000000000 / (5 * 5 * 5), + 0x20000000000000 / (5 * 5 * 5 * 5), + 0x20000000000000 / (constant_55555), + 0x20000000000000 / (constant_55555 * 5), + 0x20000000000000 / (constant_55555 * 5 * 5), + 0x20000000000000 / (constant_55555 * 5 * 5 * 5), + 0x20000000000000 / (constant_55555 * 5 * 5 * 5 * 5), + 0x20000000000000 / (constant_55555 * constant_55555), + 0x20000000000000 / (constant_55555 * constant_55555 * 5), + 0x20000000000000 / (constant_55555 * constant_55555 * 5 * 5), + 0x20000000000000 / (constant_55555 * constant_55555 * 5 * 5 * 5), + 0x20000000000000 / (constant_55555 * constant_55555 * constant_55555), + 0x20000000000000 / (constant_55555 * constant_55555 * constant_55555 * 5), + 0x20000000000000 / + (constant_55555 * constant_55555 * constant_55555 * 5 * 5), + 0x20000000000000 / + (constant_55555 * constant_55555 * constant_55555 * 5 * 5 * 5), + 0x20000000000000 / + 
(constant_55555 * constant_55555 * constant_55555 * 5 * 5 * 5 * 5), + 0x20000000000000 / + (constant_55555 * constant_55555 * constant_55555 * constant_55555), + 0x20000000000000 / (constant_55555 * constant_55555 * constant_55555 * + constant_55555 * 5), + 0x20000000000000 / (constant_55555 * constant_55555 * constant_55555 * + constant_55555 * 5 * 5), + 0x20000000000000 / (constant_55555 * constant_55555 * constant_55555 * + constant_55555 * 5 * 5 * 5), + 0x20000000000000 / (constant_55555 * constant_55555 * constant_55555 * + constant_55555 * 5 * 5 * 5 * 5)}; }; +#if FASTFLOAT_DETAIL_MUST_DEFINE_CONSTEXPR_VARIABLE + template constexpr double binary_format_lookup_tables::powers_of_ten[]; template constexpr uint64_t binary_format_lookup_tables::max_mantissa[]; -template -struct binary_format_lookup_tables { +#endif + +template struct binary_format_lookup_tables { static constexpr float powers_of_ten[] = {1e0f, 1e1f, 1e2f, 1e3f, 1e4f, 1e5f, - 1e6f, 1e7f, 1e8f, 1e9f, 1e10f}; + 1e6f, 1e7f, 1e8f, 1e9f, 1e10f}; // Largest integer value v so that (5**index * v) <= 1<<24. // 0x1000000 == 1<<24 static constexpr uint64_t max_mantissa[] = { - 0x1000000, - 0x1000000 / 5, - 0x1000000 / (5 * 5), - 0x1000000 / (5 * 5 * 5), - 0x1000000 / (5 * 5 * 5 * 5), - 0x1000000 / (constant_55555), - 0x1000000 / (constant_55555 * 5), - 0x1000000 / (constant_55555 * 5 * 5), - 0x1000000 / (constant_55555 * 5 * 5 * 5), - 0x1000000 / (constant_55555 * 5 * 5 * 5 * 5), - 0x1000000 / (constant_55555 * constant_55555), - 0x1000000 / (constant_55555 * constant_55555 * 5)}; + 0x1000000, + 0x1000000 / 5, + 0x1000000 / (5 * 5), + 0x1000000 / (5 * 5 * 5), + 0x1000000 / (5 * 5 * 5 * 5), + 0x1000000 / (constant_55555), + 0x1000000 / (constant_55555 * 5), + 0x1000000 / (constant_55555 * 5 * 5), + 0x1000000 / (constant_55555 * 5 * 5 * 5), + 0x1000000 / (constant_55555 * 5 * 5 * 5 * 5), + 0x1000000 / (constant_55555 * constant_55555), + 0x1000000 / (constant_55555 * constant_55555 * 5)}; }; +#if FASTFLOAT_DETAIL_MUST_DEFINE_CONSTEXPR_VARIABLE + template constexpr float binary_format_lookup_tables::powers_of_ten[]; template constexpr uint64_t binary_format_lookup_tables::max_mantissa[]; -template <> inline constexpr int binary_format::min_exponent_fast_path() { +#endif + +template <> +inline constexpr int binary_format::min_exponent_fast_path() { #if (FLT_EVAL_METHOD != 1) && (FLT_EVAL_METHOD != 0) return 0; #else @@ -436,7 +493,8 @@ template <> inline constexpr int binary_format::min_exponent_fast_path() #endif } -template <> inline constexpr int binary_format::min_exponent_fast_path() { +template <> +inline constexpr int binary_format::min_exponent_fast_path() { #if (FLT_EVAL_METHOD != 1) && (FLT_EVAL_METHOD != 0) return 0; #else @@ -444,26 +502,32 @@ template <> inline constexpr int binary_format::min_exponent_fast_path() #endif } -template <> inline constexpr int binary_format::mantissa_explicit_bits() { +template <> +inline constexpr int binary_format::mantissa_explicit_bits() { return 52; } -template <> inline constexpr int binary_format::mantissa_explicit_bits() { +template <> +inline constexpr int binary_format::mantissa_explicit_bits() { return 23; } -template <> inline constexpr int binary_format::max_exponent_round_to_even() { +template <> +inline constexpr int binary_format::max_exponent_round_to_even() { return 23; } -template <> inline constexpr int binary_format::max_exponent_round_to_even() { +template <> +inline constexpr int binary_format::max_exponent_round_to_even() { return 10; } -template <> inline constexpr int 
binary_format::min_exponent_round_to_even() { +template <> +inline constexpr int binary_format::min_exponent_round_to_even() { return -4; } -template <> inline constexpr int binary_format::min_exponent_round_to_even() { +template <> +inline constexpr int binary_format::min_exponent_round_to_even() { return -17; } @@ -481,30 +545,42 @@ template <> inline constexpr int binary_format::infinite_power() { return 0xFF; } -template <> inline constexpr int binary_format::sign_index() { return 63; } -template <> inline constexpr int binary_format::sign_index() { return 31; } +template <> inline constexpr int binary_format::sign_index() { + return 63; +} +template <> inline constexpr int binary_format::sign_index() { + return 31; +} -template <> inline constexpr int binary_format::max_exponent_fast_path() { +template <> +inline constexpr int binary_format::max_exponent_fast_path() { return 22; } -template <> inline constexpr int binary_format::max_exponent_fast_path() { +template <> +inline constexpr int binary_format::max_exponent_fast_path() { return 10; } -template <> inline constexpr uint64_t binary_format::max_mantissa_fast_path() { +template <> +inline constexpr uint64_t binary_format::max_mantissa_fast_path() { return uint64_t(2) << mantissa_explicit_bits(); } -template <> inline constexpr uint64_t binary_format::max_mantissa_fast_path(int64_t power) { +template <> +inline constexpr uint64_t +binary_format::max_mantissa_fast_path(int64_t power) { // caller is responsible to ensure that // power >= 0 && power <= 22 // // Work around clang bug https://godbolt.org/z/zedh7rrhc return (void)max_mantissa[0], max_mantissa[power]; } -template <> inline constexpr uint64_t binary_format::max_mantissa_fast_path() { +template <> +inline constexpr uint64_t binary_format::max_mantissa_fast_path() { return uint64_t(2) << mantissa_explicit_bits(); } -template <> inline constexpr uint64_t binary_format::max_mantissa_fast_path(int64_t power) { +template <> +inline constexpr uint64_t +binary_format::max_mantissa_fast_path(int64_t power) { // caller is responsible to ensure that // power >= 0 && power <= 10 // @@ -513,7 +589,8 @@ template <> inline constexpr uint64_t binary_format::max_mantissa_fast_pa } template <> -inline constexpr double binary_format::exact_power_of_ten(int64_t power) { +inline constexpr double +binary_format::exact_power_of_ten(int64_t power) { // Work around clang bug https://godbolt.org/z/zedh7rrhc return (void)powers_of_ten[0], powers_of_ten[power]; } @@ -523,13 +600,10 @@ inline constexpr float binary_format::exact_power_of_ten(int64_t power) { return (void)powers_of_ten[0], powers_of_ten[power]; } - -template <> -inline constexpr int binary_format::largest_power_of_ten() { +template <> inline constexpr int binary_format::largest_power_of_ten() { return 308; } -template <> -inline constexpr int binary_format::largest_power_of_ten() { +template <> inline constexpr int binary_format::largest_power_of_ten() { return 38; } @@ -537,9 +611,8 @@ template <> inline constexpr int binary_format::smallest_power_of_ten() { return -342; } -template <> -inline constexpr int binary_format::smallest_power_of_ten() { - return -65; +template <> inline constexpr int binary_format::smallest_power_of_ten() { + return -64; } template <> inline constexpr size_t binary_format::max_digits() { @@ -549,39 +622,46 @@ template <> inline constexpr size_t binary_format::max_digits() { return 114; } -template <> inline constexpr binary_format::equiv_uint - binary_format::exponent_mask() { +template <> +inline 
constexpr binary_format::equiv_uint +binary_format::exponent_mask() { return 0x7F800000; } -template <> inline constexpr binary_format::equiv_uint - binary_format::exponent_mask() { +template <> +inline constexpr binary_format::equiv_uint +binary_format::exponent_mask() { return 0x7FF0000000000000; } -template <> inline constexpr binary_format::equiv_uint - binary_format::mantissa_mask() { +template <> +inline constexpr binary_format::equiv_uint +binary_format::mantissa_mask() { return 0x007FFFFF; } -template <> inline constexpr binary_format::equiv_uint - binary_format::mantissa_mask() { +template <> +inline constexpr binary_format::equiv_uint +binary_format::mantissa_mask() { return 0x000FFFFFFFFFFFFF; } -template <> inline constexpr binary_format::equiv_uint - binary_format::hidden_bit_mask() { +template <> +inline constexpr binary_format::equiv_uint +binary_format::hidden_bit_mask() { return 0x00800000; } -template <> inline constexpr binary_format::equiv_uint - binary_format::hidden_bit_mask() { +template <> +inline constexpr binary_format::equiv_uint +binary_format::hidden_bit_mask() { return 0x0010000000000000; } -template -fastfloat_really_inline FASTFLOAT_CONSTEXPR20 -void to_float(bool negative, adjusted_mantissa am, T &value) { +template +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 void +to_float(bool negative, adjusted_mantissa am, T &value) { using fastfloat_uint = typename binary_format::equiv_uint; fastfloat_uint word = (fastfloat_uint)am.mantissa; - word |= fastfloat_uint(am.power2) << binary_format::mantissa_explicit_bits(); + word |= fastfloat_uint(am.power2) + << binary_format::mantissa_explicit_bits(); word |= fastfloat_uint(negative) << binary_format::sign_index(); #if FASTFLOAT_HAS_BIT_CAST value = std::bit_cast(word); @@ -591,89 +671,132 @@ void to_float(bool negative, adjusted_mantissa am, T &value) { } #ifdef FASTFLOAT_SKIP_WHITE_SPACE // disabled by default -template -struct space_lut { +template struct space_lut { static constexpr bool value[] = { - 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; }; -template -constexpr bool space_lut::value[]; +#if FASTFLOAT_DETAIL_MUST_DEFINE_CONSTEXPR_VARIABLE + +template constexpr bool space_lut::value[]; + +#endif inline constexpr bool is_space(uint8_t c) { return space_lut<>::value[c]; } #endif -template -static constexpr uint64_t int_cmp_zeros() -{ - static_assert((sizeof(UC) == 1) || (sizeof(UC) == 2) || (sizeof(UC) == 4), "Unsupported character size"); - return (sizeof(UC) == 1) ? 0x3030303030303030 : (sizeof(UC) == 2) ? (uint64_t(UC('0')) << 48 | uint64_t(UC('0')) << 32 | uint64_t(UC('0')) << 16 | UC('0')) : (uint64_t(UC('0')) << 32 | UC('0')); -} -template -static constexpr int int_cmp_len() -{ - return sizeof(uint64_t) / sizeof(UC); -} -template -static constexpr UC const * str_const_nan() -{ - return nullptr; -} -template<> -constexpr char const * str_const_nan() -{ - return "nan"; -} -template<> -constexpr wchar_t const * str_const_nan() -{ - return L"nan"; -} -template<> -constexpr char16_t const * str_const_nan() -{ - return u"nan"; -} -template<> -constexpr char32_t const * str_const_nan() -{ - return U"nan"; -} -template -static constexpr UC const * str_const_inf() -{ - return nullptr; -} -template<> -constexpr char const * str_const_inf() -{ - return "infinity"; -} -template<> -constexpr wchar_t const * str_const_inf() -{ - return L"infinity"; -} -template<> -constexpr char16_t const * str_const_inf() -{ - return u"infinity"; -} -template<> -constexpr char32_t const * str_const_inf() -{ - return U"infinity"; +template static constexpr uint64_t int_cmp_zeros() { + static_assert((sizeof(UC) == 1) || (sizeof(UC) == 2) || (sizeof(UC) == 4), + "Unsupported character size"); + return (sizeof(UC) == 1) ? 0x3030303030303030 + : (sizeof(UC) == 2) + ? (uint64_t(UC('0')) << 48 | uint64_t(UC('0')) << 32 | + uint64_t(UC('0')) << 16 | UC('0')) + : (uint64_t(UC('0')) << 32 | UC('0')); +} +template static constexpr int int_cmp_len() { + return sizeof(uint64_t) / sizeof(UC); +} +template static constexpr UC const *str_const_nan() { + return nullptr; +} +template <> constexpr char const *str_const_nan() { return "nan"; } +template <> constexpr wchar_t const *str_const_nan() { return L"nan"; } +template <> constexpr char16_t const *str_const_nan() { + return u"nan"; +} +template <> constexpr char32_t const *str_const_nan() { + return U"nan"; +} +template static constexpr UC const *str_const_inf() { + return nullptr; +} +template <> constexpr char const *str_const_inf() { return "infinity"; } +template <> constexpr wchar_t const *str_const_inf() { + return L"infinity"; +} +template <> constexpr char16_t const *str_const_inf() { + return u"infinity"; +} +template <> constexpr char32_t const *str_const_inf() { + return U"infinity"; +} + +template struct int_luts { + static constexpr uint8_t chdigit[] = { + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 255, 255, + 255, 255, 255, 255, 255, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, + 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, + 35, 255, 255, 255, 255, 255, 255, 10, 11, 12, 13, 14, 15, 16, 17, + 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + 33, 34, 35, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255}; + + static constexpr size_t maxdigits_u64[] = { + 64, 41, 32, 28, 25, 23, 22, 21, 20, 19, 18, 18, 17, 17, 16, 16, 16, 16, + 15, 15, 15, 15, 14, 14, 14, 14, 14, 14, 14, 13, 13, 13, 13, 13, 13}; + + static constexpr uint64_t min_safe_u64[] = { + 9223372036854775808ull, 12157665459056928801ull, 4611686018427387904, + 7450580596923828125, 4738381338321616896, 3909821048582988049, + 9223372036854775808ull, 12157665459056928801ull, 10000000000000000000ull, + 5559917313492231481, 2218611106740436992, 8650415919381337933, + 2177953337809371136, 6568408355712890625, 1152921504606846976, + 2862423051509815793, 6746640616477458432, 15181127029874798299ull, + 1638400000000000000, 3243919932521508681, 6221821273427820544, + 11592836324538749809ull, 876488338465357824, 1490116119384765625, + 2481152873203736576, 4052555153018976267, 6502111422497947648, + 10260628712958602189ull, 15943230000000000000ull, 787662783788549761, + 1152921504606846976, 1667889514952984961, 2386420683693101056, + 3379220508056640625, 4738381338321616896}; +}; + +#if FASTFLOAT_DETAIL_MUST_DEFINE_CONSTEXPR_VARIABLE + +template constexpr uint8_t int_luts::chdigit[]; + +template constexpr size_t int_luts::maxdigits_u64[]; + +template constexpr uint64_t int_luts::min_safe_u64[]; + +#endif + +template +fastfloat_really_inline constexpr uint8_t ch_to_digit(UC c) { + return int_luts<>::chdigit[static_cast(c)]; } + +fastfloat_really_inline constexpr size_t max_digits_u64(int base) { + return int_luts<>::maxdigits_u64[base - 2]; +} + +// If a u64 is exactly max_digits_u64() in length, this is +// the value below which it has definitely overflowed. +fastfloat_really_inline constexpr uint64_t min_safe_u64(int base) { + return int_luts<>::min_safe_u64[base - 2]; +} + } // namespace fast_float #endif diff --git a/third_party/fast_float/parse_number.h b/third_party/fast_float/parse_number.h index a011a8cbf4df..6d883fb96ea1 100644 --- a/third_party/fast_float/parse_number.h +++ b/third_party/fast_float/parse_number.h @@ -10,10 +10,8 @@ #include #include #include - namespace fast_float { - namespace detail { /** * Special case +inf, -inf, nan, infinity, -infinity. @@ -21,45 +19,53 @@ namespace detail { * strings a null-free and fixed. 
**/ template -from_chars_result_t FASTFLOAT_CONSTEXPR14 -parse_infnan(UC const * first, UC const * last, T &value) noexcept { +from_chars_result_t FASTFLOAT_CONSTEXPR14 parse_infnan(UC const *first, + UC const *last, + T &value) noexcept { from_chars_result_t answer{}; answer.ptr = first; answer.ec = std::errc(); // be optimistic bool minusSign = false; - if (*first == UC('-')) { // assume first < last, so dereference without checks; C++17 20.19.3.(7.1) explicitly forbids '+' here - minusSign = true; - ++first; + if (*first == + UC('-')) { // assume first < last, so dereference without checks; + // C++17 20.19.3.(7.1) explicitly forbids '+' here + minusSign = true; + ++first; } #ifdef FASTFLOAT_ALLOWS_LEADING_PLUS // disabled by default if (*first == UC('+')) { - ++first; + ++first; } #endif if (last - first >= 3) { if (fastfloat_strncasecmp(first, str_const_nan(), 3)) { answer.ptr = (first += 3); - value = minusSign ? -std::numeric_limits::quiet_NaN() : std::numeric_limits::quiet_NaN(); - // Check for possible nan(n-char-seq-opt), C++17 20.19.3.7, C11 7.20.1.3.3. At least MSVC produces nan(ind) and nan(snan). - if(first != last && *first == UC('(')) { - for(UC const * ptr = first + 1; ptr != last; ++ptr) { + value = minusSign ? -std::numeric_limits::quiet_NaN() + : std::numeric_limits::quiet_NaN(); + // Check for possible nan(n-char-seq-opt), C++17 20.19.3.7, + // C11 7.20.1.3.3. At least MSVC produces nan(ind) and nan(snan). + if (first != last && *first == UC('(')) { + for (UC const *ptr = first + 1; ptr != last; ++ptr) { if (*ptr == UC(')')) { answer.ptr = ptr + 1; // valid nan(n-char-seq-opt) break; - } - else if(!((UC('a') <= *ptr && *ptr <= UC('z')) || (UC('A') <= *ptr && *ptr <= UC('Z')) || (UC('0') <= *ptr && *ptr <= UC('9')) || *ptr == UC('_'))) + } else if (!((UC('a') <= *ptr && *ptr <= UC('z')) || + (UC('A') <= *ptr && *ptr <= UC('Z')) || + (UC('0') <= *ptr && *ptr <= UC('9')) || *ptr == UC('_'))) break; // forbidden char, not nan(n-char-seq-opt) } } return answer; } if (fastfloat_strncasecmp(first, str_const_inf(), 3)) { - if ((last - first >= 8) && fastfloat_strncasecmp(first + 3, str_const_inf() + 3, 5)) { + if ((last - first >= 8) && + fastfloat_strncasecmp(first + 3, str_const_inf() + 3, 5)) { answer.ptr = first + 8; } else { answer.ptr = first + 3; } - value = minusSign ? -std::numeric_limits::infinity() : std::numeric_limits::infinity(); + value = minusSign ? -std::numeric_limits::infinity() + : std::numeric_limits::infinity(); return answer; } } @@ -89,98 +95,128 @@ fastfloat_really_inline bool rounds_to_nearest() noexcept { // // The volatile keywoard prevents the compiler from computing the function // at compile-time. - // There might be other ways to prevent compile-time optimizations (e.g., asm). - // The value does not need to be std::numeric_limits::min(), any small - // value so that 1 + x should round to 1 would do (after accounting for excess - // precision, as in 387 instructions). + // There might be other ways to prevent compile-time optimizations (e.g., + // asm). The value does not need to be std::numeric_limits::min(), any + // small value so that 1 + x should round to 1 would do (after accounting for + // excess precision, as in 387 instructions). static volatile float fmin = std::numeric_limits::min(); float fmini = fmin; // we copy it so that it gets loaded at most once. - // - // Explanation: - // Only when fegetround() == FE_TONEAREST do we have that - // fmin + 1.0f == 1.0f - fmin. 
- // - // FE_UPWARD: - // fmin + 1.0f > 1 - // 1.0f - fmin == 1 - // - // FE_DOWNWARD or FE_TOWARDZERO: - // fmin + 1.0f == 1 - // 1.0f - fmin < 1 - // - // Note: This may fail to be accurate if fast-math has been - // enabled, as rounding conventions may not apply. - #ifdef FASTFLOAT_VISUAL_STUDIO - # pragma warning(push) - // todo: is there a VS warning? - // see https://stackoverflow.com/questions/46079446/is-there-a-warning-for-floating-point-equality-checking-in-visual-studio-2013 - #elif defined(__clang__) - # pragma clang diagnostic push - # pragma clang diagnostic ignored "-Wfloat-equal" - #elif defined(__GNUC__) - # pragma GCC diagnostic push - # pragma GCC diagnostic ignored "-Wfloat-equal" - #endif +// +// Explanation: +// Only when fegetround() == FE_TONEAREST do we have that +// fmin + 1.0f == 1.0f - fmin. +// +// FE_UPWARD: +// fmin + 1.0f > 1 +// 1.0f - fmin == 1 +// +// FE_DOWNWARD or FE_TOWARDZERO: +// fmin + 1.0f == 1 +// 1.0f - fmin < 1 +// +// Note: This may fail to be accurate if fast-math has been +// enabled, as rounding conventions may not apply. +#ifdef FASTFLOAT_VISUAL_STUDIO +#pragma warning(push) +// todo: is there a VS warning? +// see +// https://stackoverflow.com/questions/46079446/is-there-a-warning-for-floating-point-equality-checking-in-visual-studio-2013 +#elif defined(__clang__) +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wfloat-equal" +#elif defined(__GNUC__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wfloat-equal" +#endif return (fmini + 1.0f == 1.0f - fmini); - #ifdef FASTFLOAT_VISUAL_STUDIO - # pragma warning(pop) - #elif defined(__clang__) - # pragma clang diagnostic pop - #elif defined(__GNUC__) - # pragma GCC diagnostic pop - #endif +#ifdef FASTFLOAT_VISUAL_STUDIO +#pragma warning(pop) +#elif defined(__clang__) +#pragma clang diagnostic pop +#elif defined(__GNUC__) +#pragma GCC diagnostic pop +#endif } } // namespace detail -template -FASTFLOAT_CONSTEXPR20 -from_chars_result_t from_chars(UC const * first, UC const * last, - T &value, chars_format fmt /*= chars_format::general*/) noexcept { - return from_chars_advanced(first, last, value, parse_options_t{fmt}); +template struct from_chars_caller { + template + FASTFLOAT_CONSTEXPR20 static from_chars_result_t + call(UC const *first, UC const *last, T &value, + parse_options_t options) noexcept { + return from_chars_advanced(first, last, value, options); + } +}; + +#if __STDCPP_FLOAT32_T__ == 1 +template <> struct from_chars_caller { + template + FASTFLOAT_CONSTEXPR20 static from_chars_result_t + call(UC const *first, UC const *last, std::float32_t &value, + parse_options_t options) noexcept { + // if std::float32_t is defined, and we are in C++23 mode; macro set for + // float32; set value to float due to equivalence between float and + // float32_t + float val; + auto ret = from_chars_advanced(first, last, val, options); + value = val; + return ret; + } +}; +#endif + +#if __STDCPP_FLOAT64_T__ == 1 +template <> struct from_chars_caller { + template + FASTFLOAT_CONSTEXPR20 static from_chars_result_t + call(UC const *first, UC const *last, std::float64_t &value, + parse_options_t options) noexcept { + // if std::float64_t is defined, and we are in C++23 mode; macro set for + // float64; set value as double due to equivalence between double and + // float64_t + double val; + auto ret = from_chars_advanced(first, last, val, options); + value = val; + return ret; + } +}; +#endif + +template +FASTFLOAT_CONSTEXPR20 from_chars_result_t +from_chars(UC const *first, 
UC const *last, T &value, + chars_format fmt /*= chars_format::general*/) noexcept { + return from_chars_caller::call(first, last, value, + parse_options_t(fmt)); } -template -FASTFLOAT_CONSTEXPR20 -from_chars_result_t from_chars_advanced(UC const * first, UC const * last, - T &value, parse_options_t options) noexcept { +/** + * This function overload takes parsed_number_string_t structure that is created + * and populated either by from_chars_advanced function taking chars range and + * parsing options or other parsing custom function implemented by user. + */ +template +FASTFLOAT_CONSTEXPR20 from_chars_result_t +from_chars_advanced(parsed_number_string_t &pns, T &value) noexcept { - static_assert (std::is_same::value || std::is_same::value, "only float and double are supported"); - static_assert (std::is_same::value || - std::is_same::value || - std::is_same::value || - std::is_same::value , "only char, wchar_t, char16_t and char32_t are supported"); + static_assert(is_supported_float_type(), + "only some floating-point types are supported"); + static_assert(is_supported_char_type(), + "only char, wchar_t, char16_t and char32_t are supported"); from_chars_result_t answer; -#ifdef FASTFLOAT_SKIP_WHITE_SPACE // disabled by default - while ((first != last) && fast_float::is_space(uint8_t(*first))) { - first++; - } -#endif - if (first == last) { - answer.ec = std::errc::invalid_argument; - answer.ptr = first; - return answer; - } - parsed_number_string_t pns = parse_number_string(first, last, options); - if (!pns.valid) { - if (options.format & chars_format::no_infnan) { - answer.ec = std::errc::invalid_argument; - answer.ptr = first; - return answer; - } else { - return detail::parse_infnan(first, last, value); - } - } answer.ec = std::errc(); // be optimistic answer.ptr = pns.lastmatch; // The implementation of the Clinger's fast path is convoluted because // we want round-to-nearest in all cases, irrespective of the rounding mode // selected on the thread. - // We proceed optimistically, assuming that detail::rounds_to_nearest() returns - // true. - if (binary_format::min_exponent_fast_path() <= pns.exponent && pns.exponent <= binary_format::max_exponent_fast_path() && !pns.too_many_digits) { + // We proceed optimistically, assuming that detail::rounds_to_nearest() + // returns true. + if (binary_format::min_exponent_fast_path() <= pns.exponent && + pns.exponent <= binary_format::max_exponent_fast_path() && + !pns.too_many_digits) { // Unfortunately, the conventional Clinger's fast path is only possible // when the system rounds to the nearest float. // @@ -188,50 +224,123 @@ from_chars_result_t from_chars_advanced(UC const * first, UC const * last, // We could check it first (before the previous branch), but // there might be performance advantages at having the check // be last. - if(!cpp20_and_in_constexpr() && detail::rounds_to_nearest()) { + if (!cpp20_and_in_constexpr() && detail::rounds_to_nearest()) { // We have that fegetround() == FE_TONEAREST. // Next is Clinger's fast path. 
- if (pns.mantissa <=binary_format::max_mantissa_fast_path()) { + if (pns.mantissa <= binary_format::max_mantissa_fast_path()) { value = T(pns.mantissa); - if (pns.exponent < 0) { value = value / binary_format::exact_power_of_ten(-pns.exponent); } - else { value = value * binary_format::exact_power_of_ten(pns.exponent); } - if (pns.negative) { value = -value; } + if (pns.exponent < 0) { + value = value / binary_format::exact_power_of_ten(-pns.exponent); + } else { + value = value * binary_format::exact_power_of_ten(pns.exponent); + } + if (pns.negative) { + value = -value; + } return answer; } } else { // We do not have that fegetround() == FE_TONEAREST. - // Next is a modified Clinger's fast path, inspired by Jakub Jelínek's proposal - if (pns.exponent >= 0 && pns.mantissa <=binary_format::max_mantissa_fast_path(pns.exponent)) { -#if defined(__clang__) + // Next is a modified Clinger's fast path, inspired by Jakub Jelínek's + // proposal + if (pns.exponent >= 0 && + pns.mantissa <= + binary_format::max_mantissa_fast_path(pns.exponent)) { +#if defined(__clang__) || defined(FASTFLOAT_32BIT) // Clang may map 0 to -0.0 when fegetround() == FE_DOWNWARD - if(pns.mantissa == 0) { - value = pns.negative ? -0. : 0.; + if (pns.mantissa == 0) { + value = pns.negative ? T(-0.) : T(0.); return answer; } #endif - value = T(pns.mantissa) * binary_format::exact_power_of_ten(pns.exponent); - if (pns.negative) { value = -value; } + value = T(pns.mantissa) * + binary_format::exact_power_of_ten(pns.exponent); + if (pns.negative) { + value = -value; + } return answer; } } } - adjusted_mantissa am = compute_float>(pns.exponent, pns.mantissa); - if(pns.too_many_digits && am.power2 >= 0) { - if(am != compute_float>(pns.exponent, pns.mantissa + 1)) { + adjusted_mantissa am = + compute_float>(pns.exponent, pns.mantissa); + if (pns.too_many_digits && am.power2 >= 0) { + if (am != compute_float>(pns.exponent, pns.mantissa + 1)) { am = compute_error>(pns.exponent, pns.mantissa); } } - // If we called compute_float>(pns.exponent, pns.mantissa) and we have an invalid power (am.power2 < 0), - // then we need to go the long way around again. This is very uncommon. - if(am.power2 < 0) { am = digit_comp(pns, am); } + // If we called compute_float>(pns.exponent, pns.mantissa) + // and we have an invalid power (am.power2 < 0), then we need to go the long + // way around again. This is very uncommon. + if (am.power2 < 0) { + am = digit_comp(pns, am); + } to_float(pns.negative, am, value); // Test for over/underflow. 
- if ((pns.mantissa != 0 && am.mantissa == 0 && am.power2 == 0) || am.power2 == binary_format::infinite_power()) { + if ((pns.mantissa != 0 && am.mantissa == 0 && am.power2 == 0) || + am.power2 == binary_format::infinite_power()) { answer.ec = std::errc::result_out_of_range; } return answer; } +template +FASTFLOAT_CONSTEXPR20 from_chars_result_t +from_chars_advanced(UC const *first, UC const *last, T &value, + parse_options_t options) noexcept { + + static_assert(is_supported_float_type(), + "only some floating-point types are supported"); + static_assert(is_supported_char_type(), + "only char, wchar_t, char16_t and char32_t are supported"); + + from_chars_result_t answer; +#ifdef FASTFLOAT_SKIP_WHITE_SPACE // disabled by default + while ((first != last) && fast_float::is_space(uint8_t(*first))) { + first++; + } +#endif + if (first == last) { + answer.ec = std::errc::invalid_argument; + answer.ptr = first; + return answer; + } + parsed_number_string_t pns = + parse_number_string(first, last, options); + if (!pns.valid) { + if (options.format & chars_format::no_infnan) { + answer.ec = std::errc::invalid_argument; + answer.ptr = first; + return answer; + } else { + return detail::parse_infnan(first, last, value); + } + } + + // call overload that takes parsed_number_string_t directly. + return from_chars_advanced(pns, value); +} + +template +FASTFLOAT_CONSTEXPR20 from_chars_result_t +from_chars(UC const *first, UC const *last, T &value, int base) noexcept { + static_assert(is_supported_char_type(), + "only char, wchar_t, char16_t and char32_t are supported"); + + from_chars_result_t answer; +#ifdef FASTFLOAT_SKIP_WHITE_SPACE // disabled by default + while ((first != last) && fast_float::is_space(uint8_t(*first))) { + first++; + } +#endif + if (first == last || base < 2 || base > 36) { + answer.ec = std::errc::invalid_argument; + answer.ptr = first; + return answer; + } + return parse_int_string(first, last, value, base); +} + } // namespace fast_float #endif From 361754b9d5f6c906781f28caefc1cf363a9a81a8 Mon Sep 17 00:00:00 2001 From: Even Rouault Date: Wed, 6 Nov 2024 02:42:59 +0100 Subject: [PATCH 33/62] Doc: software_using_gdal.rst: mention OpenDataCube --- doc/source/software_using_gdal.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/software_using_gdal.rst b/doc/source/software_using_gdal.rst index 7886d4df8b77..e5ae5e161eab 100644 --- a/doc/source/software_using_gdal.rst +++ b/doc/source/software_using_gdal.rst @@ -44,6 +44,7 @@ Free and open source - `NextGIS Web `_ Server-side Web GIS and a framework for storage, visualization and permissions management of all kinds - `Ogr2 GUI `_ Graphical user interface for ogr2ogr - `OpenCPN `_ A concise ChartPlotter/Navigator. A cross-platform ship-borne GUI application. +- `OpenDataCube `_ FOSS software package that simplifies the management and analysis of large amounts of satellite imagery and other Earth observation data. - `OpenEV `_ An OpenGL/GTK/Python based graphical viewer which exclusively uses GDAL for raster access. - `OFGT `_ a collection of utilities for multipurpose forest monitoring under the `Open Foris Initiative `_ Open Foris Initiative. 
- `OpenFLUID `_ a software platform for spatial modelling of landscapes dynamics From 825c659bf63fd40ec21ff87854b3c1843ee2d23b Mon Sep 17 00:00:00 2001 From: Even Rouault Date: Wed, 6 Nov 2024 16:29:56 +0100 Subject: [PATCH 34/62] CITATION.cff: update with 3.10.0 [ci skip] --- CITATION.cff | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CITATION.cff b/CITATION.cff index a4b3a817404c..f55417d0e5bb 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -2,8 +2,8 @@ cff-version: 1.2.0 message: Please cite this software using these metadata or in the CITATION file. type: software title: GDAL -version: 3.8.3 -date-released: 2024-01-02 +version: 3.10.0 +date-released: 2024-11-01 doi: 10.5281/zenodo.5884351 abstract: GDAL is a translator library for raster and vector geospatial data formats that is released under an MIT style Open Source License by the Open From 611cf4ca9fa0a911a1981a9e4fc576b9411a53b5 Mon Sep 17 00:00:00 2001 From: Even Rouault Date: Tue, 29 Oct 2024 11:34:19 +0100 Subject: [PATCH 35/62] Doc: advertize 3.10.0 --- doc/source/about_no_title.rst | 4 ++-- doc/source/download.rst | 8 ++++---- doc/source/download_past.rst | 6 ++++++ 3 files changed, 12 insertions(+), 6 deletions(-) diff --git a/doc/source/about_no_title.rst b/doc/source/about_no_title.rst index c60c858cce3e..ccaa6d31b24f 100644 --- a/doc/source/about_no_title.rst +++ b/doc/source/about_no_title.rst @@ -1,4 +1,4 @@ -GDAL is a translator library for raster and vector geospatial data formats that is released under an MIT style Open Source :ref:`license` by the `Open Source Geospatial Foundation`_. As a library, it presents a single raster abstract data model and single vector abstract data model to the calling application for all supported formats. It also comes with a variety of useful command line utilities for data translation and processing. The `NEWS`_ page describes the October 2024 GDAL/OGR 3.9.3 release. +GDAL is a translator library for raster and vector geospatial data formats that is released under an MIT style Open Source :ref:`license` by the `Open Source Geospatial Foundation`_. As a library, it presents a single raster abstract data model and single vector abstract data model to the calling application for all supported formats. It also comes with a variety of useful command line utilities for data translation and processing. The `NEWS`_ page describes the October 2024 GDAL/OGR 3.10.0 release. .. note:: @@ -17,7 +17,7 @@ GDAL is a translator library for raster and vector geospatial data formats that :target: `Open Source Geospatial Foundation`_ .. _`Open Source Geospatial Foundation`: http://www.osgeo.org/ -.. _`NEWS`: https://github.com/OSGeo/gdal/blob/v3.9.3/NEWS.md +.. _`NEWS`: https://github.com/OSGeo/gdal/blob/v3.10.0/NEWS.md See :ref:`software_using_gdal` diff --git a/doc/source/download.rst b/doc/source/download.rst index ca336557fdc3..33676e67289b 100644 --- a/doc/source/download.rst +++ b/doc/source/download.rst @@ -18,11 +18,11 @@ Source Code Current Release ............... -* **2024-10-14** `gdal-3.9.3.tar.gz`_ `3.9.3 Release Notes`_ (`3.9.3 md5`_) +* **2024-10-29** `gdal-3.10.0.tar.gz`_ `3.10.0 Release Notes`_ (`3.10.0 md5`_) -.. _`3.9.3 Release Notes`: https://github.com/OSGeo/gdal/blob/v3.9.3/NEWS.md -.. _`gdal-3.9.3.tar.gz`: https://github.com/OSGeo/gdal/releases/download/v3.9.3/gdal-3.9.3.tar.gz -.. _`3.9.3 md5`: https://github.com/OSGeo/gdal/releases/download/v3.9.3/gdal-3.9.3.tar.gz.md5 +.. _`3.10.0 Release Notes`: https://github.com/OSGeo/gdal/blob/v3.10.0/NEWS.md +.. 
_`gdal-3.10.0.tar.gz`: https://github.com/OSGeo/gdal/releases/download/v3.10.0/gdal-3.10.0.tar.gz +.. _`3.10.0 md5`: https://github.com/OSGeo/gdal/releases/download/v3.10.0/gdal-3.10.0.tar.gz.md5 Past Releases ............. diff --git a/doc/source/download_past.rst b/doc/source/download_past.rst index f732485793ef..afff6413df04 100644 --- a/doc/source/download_past.rst +++ b/doc/source/download_past.rst @@ -5,6 +5,12 @@ Past Releases ============= +* **2024-10-14** `gdal-3.9.3.tar.gz`_ `3.9.3 Release Notes`_ (`3.9.3 md5`_) + +.. _`3.9.3 Release Notes`: https://github.com/OSGeo/gdal/blob/v3.9.3/NEWS.md +.. _`gdal-3.9.3.tar.gz`: https://github.com/OSGeo/gdal/releases/download/v3.9.3/gdal-3.9.3.tar.gz +.. _`3.9.3 md5`: https://github.com/OSGeo/gdal/releases/download/v3.9.3/gdal-3.9.3.tar.gz.md5 + * **2024-08-16** `gdal-3.9.2.tar.gz`_ `3.9.2 Release Notes`_ (`3.9.2 md5`_) .. _`3.9.2 Release Notes`: https://github.com/OSGeo/gdal/blob/v3.9.2/NEWS.md From 8ed00e70825faa98feb47a2241e6409a87a7146b Mon Sep 17 00:00:00 2001 From: Even Rouault Date: Wed, 30 Oct 2024 13:33:41 +0100 Subject: [PATCH 36/62] doc/source/download.rst: update date --- doc/source/download.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/download.rst b/doc/source/download.rst index 33676e67289b..0fd76b3c2f71 100644 --- a/doc/source/download.rst +++ b/doc/source/download.rst @@ -18,7 +18,7 @@ Source Code Current Release ............... -* **2024-10-29** `gdal-3.10.0.tar.gz`_ `3.10.0 Release Notes`_ (`3.10.0 md5`_) +* **2024-10-30** `gdal-3.10.0.tar.gz`_ `3.10.0 Release Notes`_ (`3.10.0 md5`_) .. _`3.10.0 Release Notes`: https://github.com/OSGeo/gdal/blob/v3.10.0/NEWS.md .. _`gdal-3.10.0.tar.gz`: https://github.com/OSGeo/gdal/releases/download/v3.10.0/gdal-3.10.0.tar.gz From 3e9451d8b2cdbd8ff3e252529757b9cd878b8252 Mon Sep 17 00:00:00 2001 From: Even Rouault Date: Wed, 6 Nov 2024 16:58:39 +0100 Subject: [PATCH 37/62] HOWTO-RELEASE: add paragraph about ReadTheDocs [ci skip] --- HOWTO-RELEASE | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/HOWTO-RELEASE b/HOWTO-RELEASE index 684d3b589afd..46c735f003b6 100644 --- a/HOWTO-RELEASE +++ b/HOWTO-RELEASE @@ -358,3 +358,10 @@ or your message manually approved, with an administrator of the list. 23) For bugfixes releases, forward port to master changes done in doc/source/about_no_title.rst, doc/source/download.rst and doc/source/download_past.rst + +24) For a feature release, enable a new version in the ReadTheDocs administration panel. 
+    a) Go to https://readthedocs.org/projects/gdal/versions/
+    b) In the "Activate a version" tab, enter "release/X.Y" in the text entry and click on the Filter button
+    c) Click on the Activate button
+    d) Go to https://readthedocs.org/projects/gdal/, and in the "Compile a version" drop-down list,
+       select the new "release-X.Y" label and click on the "Compile a version" button

From 174ea0077f83a431cf0bedf86dbd5f4971659fa6 Mon Sep 17 00:00:00 2001
From: Even Rouault
Date: Fri, 1 Nov 2024 16:41:33 +0100
Subject: [PATCH 38/62] Doc: update 3.10.0 release date

---
 doc/source/about_no_title.rst | 2 +-
 doc/source/download.rst | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/doc/source/about_no_title.rst b/doc/source/about_no_title.rst
index ccaa6d31b24f..819c4aca9f4a 100644
--- a/doc/source/about_no_title.rst
+++ b/doc/source/about_no_title.rst
@@ -1,4 +1,4 @@
-GDAL is a translator library for raster and vector geospatial data formats that is released under an MIT style Open Source :ref:`license` by the `Open Source Geospatial Foundation`_. As a library, it presents a single raster abstract data model and single vector abstract data model to the calling application for all supported formats. It also comes with a variety of useful command line utilities for data translation and processing. The `NEWS`_ page describes the October 2024 GDAL/OGR 3.10.0 release.
+GDAL is a translator library for raster and vector geospatial data formats that is released under an MIT style Open Source :ref:`license` by the `Open Source Geospatial Foundation`_. As a library, it presents a single raster abstract data model and single vector abstract data model to the calling application for all supported formats. It also comes with a variety of useful command line utilities for data translation and processing. The `NEWS`_ page describes the November 2024 GDAL/OGR 3.10.0 release.

 .. note::

diff --git a/doc/source/download.rst b/doc/source/download.rst
index 0fd76b3c2f71..847f3a8e3b1d 100644
--- a/doc/source/download.rst
+++ b/doc/source/download.rst
@@ -18,7 +18,7 @@ Source Code
 Current Release
 ...............

-* **2024-10-30** `gdal-3.10.0.tar.gz`_ `3.10.0 Release Notes`_ (`3.10.0 md5`_)
+* **2024-11-01** `gdal-3.10.0.tar.gz`_ `3.10.0 Release Notes`_ (`3.10.0 md5`_)

 .. _`3.10.0 Release Notes`: https://github.com/OSGeo/gdal/blob/v3.10.0/NEWS.md
 .. _`gdal-3.10.0.tar.gz`: https://github.com/OSGeo/gdal/releases/download/v3.10.0/gdal-3.10.0.tar.gz

From 73d89b78fcd25be68a4184b0457287b45c1d9b09 Mon Sep 17 00:00:00 2001
From: Even Rouault
Date: Wed, 6 Nov 2024 20:09:13 +0100
Subject: [PATCH 39/62] Doc: stop promoting unregulated far-right medium

---
 doc/source/community/code_of_conduct.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/source/community/code_of_conduct.rst b/doc/source/community/code_of_conduct.rst
index 9dc38d42ac5b..b151edb5daf9 100644
--- a/doc/source/community/code_of_conduct.rst
+++ b/doc/source/community/code_of_conduct.rst
@@ -19,7 +19,7 @@ claims any affiliation with the GDAL project.

 It applies to in-person events (such as conferences and related social events),
 IRC, public and private mailing lists, the issue tracker, the wiki, blogs,
-Twitter, and any other forums which the community uses for communication and
+social media, and any other forums which the community uses for communication and
 interactions.

 This code is not exhaustive or complete.
It serves to distill our common From 1f6defc9c7f61a6037955f9aa64a2e7bae7c9029 Mon Sep 17 00:00:00 2001 From: Even Rouault Date: Thu, 7 Nov 2024 02:26:56 +0100 Subject: [PATCH 40/62] ogr_recordbatch.h: replace pragma once by regular inclusion guard to make -Wpedantic happy --- ogr/ogr_recordbatch.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/ogr/ogr_recordbatch.h b/ogr/ogr_recordbatch.h index 6fcde1405b65..48d1a8d7c874 100644 --- a/ogr/ogr_recordbatch.h +++ b/ogr/ogr_recordbatch.h @@ -19,7 +19,8 @@ // https://github.com/apache/arrow/blob/main/cpp/src/arrow/c/abi.h WARNING: DO // NOT MODIFY the content as it would break interoperability ! -#pragma once +#ifndef OGR_RECORDBATCH_H_INCLUDED +#define OGR_RECORDBATCH_H_INCLUDED /*! @cond Doxygen_Suppress */ @@ -123,3 +124,5 @@ extern "C" #endif /*! @endcond */ + +#endif // OGR_RECORDBATCH_H_INCLUDED From 8177ae3b68a2b531d35c6efc11c3818f6653764f Mon Sep 17 00:00:00 2001 From: Even Rouault Date: Thu, 7 Nov 2024 02:27:50 +0100 Subject: [PATCH 41/62] CI: check that all our public headers can be compiled with -Wall -Wpedantic Relates to https://github.com/OSGeo/grass/issues/4585 --- autotest/postinstall/test_gdal-config.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/autotest/postinstall/test_gdal-config.sh b/autotest/postinstall/test_gdal-config.sh index 19a656ee04ae..e77088e12e7b 100755 --- a/autotest/postinstall/test_gdal-config.sh +++ b/autotest/postinstall/test_gdal-config.sh @@ -102,7 +102,7 @@ set -eu CXX="${CXX:-c++}" echo "Test that we can compile all headers with C++11 using ${CXX}" for i in $prefix/include/*.h; do - ${CXX} -std=c++11 -c $(${GDAL_CONFIG} --cflags) $i; + ${CXX} -Wall -Wpedantic -std=c++11 -c $(${GDAL_CONFIG} --cflags) $i; done echo "$ERRORS tests failed out of $NTESTS" From b47a2f207280580be62589a8f79591143d9b6840 Mon Sep 17 00:00:00 2001 From: Daniel Baston Date: Thu, 7 Nov 2024 12:52:47 -0500 Subject: [PATCH 42/62] CPLDebug: Accept values of YES,TRUE,1 Fixes https://github.com/OSGeo/gdal/issues/11219 --- autotest/gcore/misc.py | 65 ++++++++++++++++++++++++++++++++++++++++++ port/cpl_error.cpp | 13 +++++++-- 2 files changed, 75 insertions(+), 3 deletions(-) diff --git a/autotest/gcore/misc.py b/autotest/gcore/misc.py index 18d1f4c806c0..62f8b29a5663 100755 --- a/autotest/gcore/misc.py +++ b/autotest/gcore/misc.py @@ -13,6 +13,7 @@ # SPDX-License-Identifier: MIT ############################################################################### +import datetime import os import shutil @@ -721,6 +722,70 @@ def test_misc_13(): assert out_ds is None +############################################################################### +# Test parsing of CPL_DEBUG and CPL_TIMESTAMP + + +@pytest.fixture +def debug_output(): + + messages = [] + + def handle(ecls, ecode, emsg): + messages.append(emsg) + + def log_message(category, message): + messages.clear() + gdal.Debug(category, message) + return messages[0] if messages else None + + log_message.handle = handle + + with gdaltest.error_handler(handle): + yield log_message + + +@pytest.mark.parametrize( + "booleans", + [("YES", "NO"), ("TRUE", "FALSE"), ("ON", "OFF"), ("1", "0")], + ids="_".join, +) +def test_misc_cpl_debug(debug_output, booleans): + + on, off = booleans + + assert debug_output("GDAL", "msg") is None + + with gdal.config_option("CPL_DEBUG", off): + assert debug_output("GDAL", "msg") is None + + with gdal.config_option("CPL_DEBUG", on): + assert debug_output("GDAL", "message") == "GDAL: message" + + with 
gdal.config_option("CPL_TIMESTAMP", off): + assert debug_output("GDAL", "message") == "GDAL: message" + + with gdal.config_option("CPL_TIMESTAMP", on): + output = debug_output("GDAL", "message") + assert str(datetime.datetime.now().year) in output + assert output.endswith("GDAL: message") + + +def test_misc_cpl_debug_filtering(debug_output): + + with gdal.config_option("CPL_DEBUG", "GDAL"): + assert debug_output("GDAL", "msg") == "GDAL: msg" + assert debug_output("GDAL_WARP", "msg") is None + assert debug_output("", "msg") == ": msg" + + with gdal.config_option("CPL_DEBUG", "GDAL_WARP_TRANSLATE_ETC"): + assert debug_output("GDAL", "msg") == "GDAL: msg" + assert debug_output("TRANSLATE", "msg") == "TRANSLATE: msg" + + with gdal.config_option("CPL_DEBUG", ""): + assert debug_output("GDAL", "msg") == "GDAL: msg" + + ############################################################################### # Test ConfigureLogging() diff --git a/port/cpl_error.cpp b/port/cpl_error.cpp index f751837377b0..b25c6511d4aa 100644 --- a/port/cpl_error.cpp +++ b/port/cpl_error.cpp @@ -590,11 +590,18 @@ static void CPLvDebug(const char *pszCategory, /* -------------------------------------------------------------------- */ /* Does this message pass our current criteria? */ /* -------------------------------------------------------------------- */ - if (pszDebug == nullptr) + if (pszDebug == nullptr || EQUAL(pszDebug, "NO") || + EQUAL(pszDebug, "OFF") || EQUAL(pszDebug, "FALSE") || + EQUAL(pszDebug, "0")) + { return; + } - if (!EQUAL(pszDebug, "ON") && !EQUAL(pszDebug, "")) + if (!EQUAL(pszDebug, "ON") && !EQUAL(pszDebug, "YES") && + !EQUAL(pszDebug, "TRUE") && !EQUAL(pszDebug, "1") && + !EQUAL(pszDebug, "")) { + // check if value of CPL_DEBUG contains the category const size_t nLen = strlen(pszCategory); size_t i = 0; @@ -623,7 +630,7 @@ static void CPLvDebug(const char *pszCategory, pszMessage[0] = '\0'; #ifdef TIMESTAMP_DEBUG - if (CPLGetConfigOption("CPL_TIMESTAMP", nullptr) != nullptr) + if (CPLTestBool(CPLGetConfigOption("CPL_TIMESTAMP", "NO"))) { static struct CPLTimeVal tvStart; static const auto unused = CPLGettimeofday(&tvStart, nullptr); From 35c60731f511de58703952491248c6159ee763d2 Mon Sep 17 00:00:00 2001 From: Even Rouault Date: Thu, 7 Nov 2024 20:02:27 +0100 Subject: [PATCH 43/62] docker/README.md: advertize 3.10.0 [ci skip] --- docker/README.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/docker/README.md b/docker/README.md index 390cea55b7f7..6ade587c2ca0 100644 --- a/docker/README.md +++ b/docker/README.md @@ -100,11 +100,11 @@ If you are getting a ``: arena 0 background thread creation failed (1) # Images of releases -Tagged images of recent past releases are available. The last ones (at time of writing) are for GDAL 3.9.3 and PROJ 9.5.0, for linux/amd64 and linux/arm64: -* ghcr.io/osgeo/gdal:alpine-small-3.9.3 -* ghcr.io/osgeo/gdal:alpine-normal-3.9.3 -* ghcr.io/osgeo/gdal:ubuntu-small-3.9.3 -* ghcr.io/osgeo/gdal:ubuntu-full-3.9.3 +Tagged images of recent past releases are available. 
The last ones (at time of writing) are for GDAL 3.10.0 and PROJ 9.5.0, for linux/amd64 and linux/arm64: +* ghcr.io/osgeo/gdal:alpine-small-3.10.0 +* ghcr.io/osgeo/gdal:alpine-normal-3.10.0 +* ghcr.io/osgeo/gdal:ubuntu-small-3.10.0 +* ghcr.io/osgeo/gdal:ubuntu-full-3.10.0 ## Multi-arch Images From 3050b5c7d66bee960bdfe93d216b7a6f1fd35140 Mon Sep 17 00:00:00 2001 From: Even Rouault Date: Thu, 7 Nov 2024 20:07:17 +0100 Subject: [PATCH 44/62] CI: try to solve issue with maxim-lobanov/setup-xcode action no longer accepting xcode 14.3 --- .github/workflows/cmake_builds.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/cmake_builds.yml b/.github/workflows/cmake_builds.yml index 910c84966de4..938179c23dbd 100644 --- a/.github/workflows/cmake_builds.yml +++ b/.github/workflows/cmake_builds.yml @@ -606,7 +606,7 @@ jobs: - name: Setup xcode uses: maxim-lobanov/setup-xcode@60606e260d2fc5762a71e64e74b2174e8ea3c8bd # v1.6.0 with: - xcode-version: 14.3 + xcode-version: '15.4.0' - name: Checkout GDAL uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0 - name: Setup cache From 6ebf24cbb56677b5086848114fb4e8bec94cc8d2 Mon Sep 17 00:00:00 2001 From: Even Rouault Date: Sat, 19 Oct 2024 00:25:26 +0200 Subject: [PATCH 45/62] Add read-only AIVector (Artificial intelligence powered vector) driver --- .../ubuntu_24.04/expected_ogrinfo_formats.txt | 1 + ...windows_conda_expected_ogrinfo_formats.txt | 1 + autotest/ogr/ogr_aivector.py | 33 ++++ doc/source/drivers/vector/aivector.rst | 32 ++++ doc/source/drivers/vector/index.rst | 1 + frmts/drivers.ini | 3 + ogr/ogrsf_frmts/CMakeLists.txt | 2 + ogr/ogrsf_frmts/aivector/CMakeLists.txt | 9 ++ .../aivector/ograivectordriver.cpp | 146 ++++++++++++++++++ ogr/ogrsf_frmts/generic/ogrregisterall.cpp | 5 + ogr/ogrsf_frmts/ogrsf_frmts.h | 1 + 11 files changed, 234 insertions(+) create mode 100755 autotest/ogr/ogr_aivector.py create mode 100644 doc/source/drivers/vector/aivector.rst create mode 100644 ogr/ogrsf_frmts/aivector/CMakeLists.txt create mode 100644 ogr/ogrsf_frmts/aivector/ograivectordriver.cpp diff --git a/.github/workflows/ubuntu_24.04/expected_ogrinfo_formats.txt b/.github/workflows/ubuntu_24.04/expected_ogrinfo_formats.txt index 5ad73e12cb5b..859cb834ec11 100644 --- a/.github/workflows/ubuntu_24.04/expected_ogrinfo_formats.txt +++ b/.github/workflows/ubuntu_24.04/expected_ogrinfo_formats.txt @@ -86,4 +86,5 @@ Supported Formats: (ro:read-only, rw:read-write, +:update, v:virtual-I/O s:subda TIGER -vector- (rov): U.S. Census TIGER/Line AVCBin -vector- (rov): Arc/Info Binary Coverage AVCE00 -vector- (rov): Arc/Info E00 (ASCII) Coverage (*.e00) + AIVector -vector- (ro): Artificial Intelligence powered vector driver HTTP -raster,vector- (ro): HTTP Fetching Wrapper diff --git a/.github/workflows/windows_conda_expected_ogrinfo_formats.txt b/.github/workflows/windows_conda_expected_ogrinfo_formats.txt index 7d3561b01a0b..1bcea0382853 100644 --- a/.github/workflows/windows_conda_expected_ogrinfo_formats.txt +++ b/.github/workflows/windows_conda_expected_ogrinfo_formats.txt @@ -81,4 +81,5 @@ Supported Formats: (ro:read-only, rw:read-write, +:update, v:virtual-I/O s:subda TIGER -vector- (rov): U.S. 
Census TIGER/Line AVCBin -vector- (rov): Arc/Info Binary Coverage AVCE00 -vector- (rov): Arc/Info E00 (ASCII) Coverage (*.e00) + AIVector -vector- (ro): Artificial Intelligence powered vector driver HTTP -raster,vector- (ro): HTTP Fetching Wrapper diff --git a/autotest/ogr/ogr_aivector.py b/autotest/ogr/ogr_aivector.py new file mode 100755 index 000000000000..32c194666cc9 --- /dev/null +++ b/autotest/ogr/ogr_aivector.py @@ -0,0 +1,33 @@ +#!/usr/bin/env pytest +############################################################################### +# $Id$ +# +# Project: GDAL/OGR Test Suite +# Purpose: Test read functionality for OGR AIVector driver. +# Author: Even Rouault +# +############################################################################### +# Copyright (c) 2024, Even Rouault +# +# SPDX-License-Identifier: MIT +############################################################################### + +import gdaltest +import pytest + +pytestmark = pytest.mark.require_driver("AIVector") + + +def test_ogr_aivector_test_ogrsf(): + + import test_cli_utilities + + if test_cli_utilities.get_test_ogrsf_path() is None: + pytest.skip() + + ret = gdaltest.runexternal( + test_cli_utilities.get_test_ogrsf_path() + " -ro AIVector:foo.bin" + ) + + assert "INFO" in ret + assert "ERROR" not in ret diff --git a/doc/source/drivers/vector/aivector.rst b/doc/source/drivers/vector/aivector.rst new file mode 100644 index 000000000000..41c230f45dd2 --- /dev/null +++ b/doc/source/drivers/vector/aivector.rst @@ -0,0 +1,32 @@ +.. _vector.aivector: + +Artificial intelligence powered vector driver +============================================= + +.. versionadded:: 3.11 + +.. shortname:: AIVector + +.. built_in_by_default:: + +This driver builds on many years of self-funded investments from the GDAL team on AI +technologies to bring you the ultimate driver that can read any vector format. +After that one, no need for any new vector driver! + +The open syntax is ``AIVector:{filename}``, or directly specify the filename and +force the use of the AIVector driver with the ``-if`` flag of ogrinfo or ogr2ogr. +No options at all. Just enjoy the true power of AI. + +.. note:: We are open to external investors to develop the write side of the driver. + +Examples +-------- + +:: + + ogrinfo -if AIVector undocumented_proprietary_format.bin -al + +.. note:: + + The above works even if you make a typo in the filename. The driver will + automatically figure out the filename you meant. 
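To make the open syntax described above concrete, here is a minimal usage sketch through the Python bindings. It is purely illustrative and not part of the patch: it assumes a GDAL build with the AIVector driver enabled, and the file name is the same made-up one used in the example above.

```python
from osgeo import gdal

# Illustrative sketch only: open a (hypothetical) file through the AIVector
# driver by prefixing the filename with the "AIVector:" connection prefix.
ds = gdal.OpenEx("AIVector:undocumented_proprietary_format.bin", gdal.OF_VECTOR)
lyr = ds.GetLayer(0)  # the driver exposes a single layer named "result"
for feat in lyr:
    # each feature carries a "name" string field and a point geometry
    print(feat.GetField("name"), feat.GetGeometryRef().ExportToWkt())
```

With this sketch, the single feature returned would be the driver's fixed "Null Island" point, matching the layer definition in ograivectordriver.cpp further below.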
diff --git a/doc/source/drivers/vector/index.rst b/doc/source/drivers/vector/index.rst index 859bba292217..411189072692 100644 --- a/doc/source/drivers/vector/index.rst +++ b/doc/source/drivers/vector/index.rst @@ -23,6 +23,7 @@ Vector drivers :hidden: adbc + aivector amigocloud arrow avcbin diff --git a/frmts/drivers.ini b/frmts/drivers.ini index ec0ade0b2fff..ccf6bf0aca40 100644 --- a/frmts/drivers.ini +++ b/frmts/drivers.ini @@ -273,6 +273,9 @@ Tiger AVCBin AVCE00 +# Last but not the least +AIVector + # End of OGR drivers # Put here drivers that absolutely need to look for side car diff --git a/ogr/ogrsf_frmts/CMakeLists.txt b/ogr/ogrsf_frmts/CMakeLists.txt index d9337012cae0..15fc491e7665 100644 --- a/ogr/ogrsf_frmts/CMakeLists.txt +++ b/ogr/ogrsf_frmts/CMakeLists.txt @@ -90,6 +90,8 @@ if( NOT WORDS_BIGENDIAN ) ogr_optional_driver(miramon "MiraMonVector") endif() +ogr_optional_driver(aivector AIVector) + # ###################################################################################################################### # if (NOT OGR_ENABLE_DRIVER_GEOJSON_PLUGIN) diff --git a/ogr/ogrsf_frmts/aivector/CMakeLists.txt b/ogr/ogrsf_frmts/aivector/CMakeLists.txt new file mode 100644 index 000000000000..8b0b2ce48dac --- /dev/null +++ b/ogr/ogrsf_frmts/aivector/CMakeLists.txt @@ -0,0 +1,9 @@ +add_gdal_driver( + TARGET ogr_AIVector + SOURCES + ograivectordriver.cpp + PLUGIN_CAPABLE + NO_DEPS + STRONG_CXX_WFLAGS) + +gdal_standard_includes(ogr_AIVector) diff --git a/ogr/ogrsf_frmts/aivector/ograivectordriver.cpp b/ogr/ogrsf_frmts/aivector/ograivectordriver.cpp new file mode 100644 index 000000000000..6606311fffbc --- /dev/null +++ b/ogr/ogrsf_frmts/aivector/ograivectordriver.cpp @@ -0,0 +1,146 @@ +/****************************************************************************** + * + * Project: GDAL + * Purpose: Artificial Intelligence powered driver + * Author: Even Rouault, + * + ****************************************************************************** + * Copyright (c) 2024, Even Rouault + * + * SPDX-License-Identifier: MIT + ****************************************************************************/ + +#include "ogrsf_frmts.h" + +/************************************************************************/ +/* OGRAIVectorIdentify() */ +/************************************************************************/ + +static int OGRAIVectorIdentify(GDALOpenInfo *poOpenInfo) +{ + return STARTS_WITH_CI(poOpenInfo->pszFilename, "AIVector:") || + poOpenInfo->IsSingleAllowedDriver("AIVector"); +} + +/************************************************************************/ +/* OGRAIVectorOpen() */ +/************************************************************************/ + +static GDALDataset *OGRAIVectorOpen(GDALOpenInfo *poOpenInfo) +{ + if (!OGRAIVectorIdentify(poOpenInfo)) + return nullptr; + + class MyLayer final : public OGRLayer, + public OGRGetNextFeatureThroughRaw + { + OGRFeatureDefn *m_poLayerDefn = nullptr; + bool m_bReturnedFeature = false; + + CPL_DISALLOW_COPY_ASSIGN(MyLayer) + + public: + MyLayer() + { + m_poLayerDefn = new OGRFeatureDefn("result"); + SetDescription(m_poLayerDefn->GetName()); + m_poLayerDefn->Reference(); + OGRFieldDefn oFieldDefn("name", OFTString); + m_poLayerDefn->AddFieldDefn(&oFieldDefn); + OGRSpatialReference *poSRS = new OGRSpatialReference( + "GEOGCS[\"I don't know\",\n" + " DATUM[\"I don't care\",\n" + " SPHEROID[\"GRS 1980\",6378137,298.257222101,\n" + " AUTHORITY[\"EPSG\",\"7019\"]]],\n" + " PRIMEM[\"Greenwich\",0,\n" + " 
AUTHORITY[\"EPSG\",\"8901\"]],\n" + " UNIT[\"degree\",0.0174532925199433,\n" + " AUTHORITY[\"EPSG\",\"9122\"]],\n" + " AUTHORITY[\"AI\",\"TOTALLY_MADE_UP\"]]"); + m_poLayerDefn->GetGeomFieldDefn(0)->SetSpatialRef(poSRS); + poSRS->Release(); + } + + ~MyLayer() override + { + m_poLayerDefn->Release(); + } + + void ResetReading() override + { + m_bReturnedFeature = false; + } + + OGRFeatureDefn *GetLayerDefn() override + { + return m_poLayerDefn; + } + DEFINE_GET_NEXT_FEATURE_THROUGH_RAW(MyLayer) + + OGRFeature *GetNextRawFeature() + { + if (m_bReturnedFeature) + return nullptr; + m_bReturnedFeature = true; + OGRFeature *poFeature = new OGRFeature(m_poLayerDefn); + poFeature->SetFID(0); + poFeature->SetField(0, "Null Island: the place to be"); + OGRPoint *poPoint = new OGRPoint(0, 0); + poPoint->assignSpatialReference(GetSpatialRef()); + poFeature->SetGeometryDirectly(poPoint); + return poFeature; + } + + int TestCapability(const char *) override + { + return false; + } + }; + + class MyDataset final : public GDALDataset + { + MyLayer m_oLayer{}; + + public: + MyDataset() = default; + + int GetLayerCount() override + { + return 1; + } + + OGRLayer *GetLayer(int idx) override + { + return idx == 0 ? &m_oLayer : nullptr; + } + }; + + return new MyDataset(); +} + +/************************************************************************/ +/* RegisterOGRAIVector() */ +/************************************************************************/ + +void RegisterOGRAIVector() +{ + if (!GDAL_CHECK_VERSION("AIVector")) + return; + + if (GDALGetDriverByName("AIVector") != nullptr) + return; + + GDALDriver *poDriver = new GDALDriver(); + poDriver->SetDescription("AIVector"); + poDriver->SetMetadataItem(GDAL_DCAP_VECTOR, "YES"); + poDriver->SetMetadataItem(GDAL_DMD_LONGNAME, + "Artificial Intelligence powered vector driver"); + poDriver->SetMetadataItem(GDAL_DMD_HELPTOPIC, + "drivers/vector/aivector.html"); + + poDriver->SetMetadataItem(GDAL_DMD_CONNECTION_PREFIX, "AIVector:"); + + poDriver->pfnOpen = OGRAIVectorOpen; + poDriver->pfnIdentify = OGRAIVectorIdentify; + GetGDALDriverManager()->RegisterDriver(poDriver); +} diff --git a/ogr/ogrsf_frmts/generic/ogrregisterall.cpp b/ogr/ogrsf_frmts/generic/ogrregisterall.cpp index b7d6744daea6..843197c6dac5 100644 --- a/ogr/ogrsf_frmts/generic/ogrregisterall.cpp +++ b/ogr/ogrsf_frmts/generic/ogrregisterall.cpp @@ -273,4 +273,9 @@ void OGRRegisterAllInternal() RegisterOGRAVCE00(); #endif + // Last but not the least +#ifdef AIVECTOR_ENABLED + RegisterOGRAIVector(); +#endif + } /* OGRRegisterAll */ diff --git a/ogr/ogrsf_frmts/ogrsf_frmts.h b/ogr/ogrsf_frmts/ogrsf_frmts.h index 994db366a89c..e7f28b1fb155 100644 --- a/ogr/ogrsf_frmts/ogrsf_frmts.h +++ b/ogr/ogrsf_frmts/ogrsf_frmts.h @@ -741,6 +741,7 @@ void CPL_DLL RegisterOGRXODR(); void DeclareDeferredOGRXODRPlugin(); void CPL_DLL RegisterOGRADBC(); void DeclareDeferredOGRADBCPlugin(); +void CPL_DLL RegisterOGRAIVector(); // @endcond CPL_C_END From 6966463cf1ebfbc55f5227e72c79e5988e42f84a Mon Sep 17 00:00:00 2001 From: Even Rouault Date: Sat, 9 Nov 2024 12:18:23 +0100 Subject: [PATCH 46/62] gdaldem: fix help message for subcommands Now: ``` $ gdaldem TPI --help Usage: gdaldem TPI [--help] [--long-usage] [--help-general] [-of ] [-compute_edges] [-b ] [-co =]... [--quiet] input_dem output_TPI_map Note: gdaldem TPI --long-usage for full help. ``` vs before: ``` $ gdaldem TPI --help Usage: TPI [--help] [--long-usage] [--help-general] [-of ] [-compute_edges] [-b ] [-co =]... 
[--quiet] input_dem output_TPI_map Note: TPI --long-usage for full help. ``` argparse.hpp stream fix submitted to upstream in https://github.com/p-ranav/argparse/pull/382 --- apps/argparse/argparse.hpp | 2 +- apps/gdalargumentparser.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/apps/argparse/argparse.hpp b/apps/argparse/argparse.hpp index a52142848192..029fa03c2c42 100644 --- a/apps/argparse/argparse.hpp +++ b/apps/argparse/argparse.hpp @@ -2086,7 +2086,7 @@ class ArgumentParser { std::stringstream stream; std::string curline("Usage: "); - curline += this->m_program_name; + curline += this->m_parser_path; const bool multiline_usage = this->m_usage_max_line_width < std::numeric_limits::max(); const size_t indent_size = curline.size(); diff --git a/apps/gdalargumentparser.cpp b/apps/gdalargumentparser.cpp index 724dcd91957b..3b795325345b 100644 --- a/apps/gdalargumentparser.cpp +++ b/apps/gdalargumentparser.cpp @@ -37,7 +37,7 @@ GDALArgumentParser::GDALArgumentParser(const std::string &program_name, [this](const auto &) { std::cout << usage() << std::endl << std::endl; - std::cout << _("Note: ") << m_program_name + std::cout << _("Note: ") << m_parser_path << _(" --long-usage for full help.") << std::endl; std::exit(0); }) From 9d49899e655dd2e4b8a42792afa95309d6aef6f1 Mon Sep 17 00:00:00 2001 From: Even Rouault Date: Sat, 9 Nov 2024 13:44:41 +0100 Subject: [PATCH 47/62] RCM: fix various crasher bugs on corrupted files, and other minor issues found by Coverity Scan (master only) --- frmts/rcm/rcmdataset.cpp | 138 +++++++++++++++++++++------------------ 1 file changed, 74 insertions(+), 64 deletions(-) diff --git a/frmts/rcm/rcmdataset.cpp b/frmts/rcm/rcmdataset.cpp index 087521b98345..a84746d955db 100644 --- a/frmts/rcm/rcmdataset.cpp +++ b/frmts/rcm/rcmdataset.cpp @@ -457,7 +457,8 @@ void RCMCalibRasterBand::ReadLUT() } /* Get the Pixel Per range */ - if (this->stepSize == INT_MIN || this->numberOfValues == INT_MIN || + if (this->stepSize == 0 || this->stepSize == INT_MIN || + this->numberOfValues == INT_MIN || abs(this->stepSize) > INT_MAX / abs(this->numberOfValues)) { CPLError(CE_Failure, CPLE_AppDefined, @@ -588,29 +589,35 @@ void RCMCalibRasterBand::ReadNoiseLevels() atoi(CPLGetXMLValue(psNumberOfValues, "", "0")); const char *noiseLevelValues = CPLGetXMLValue(psNoiseLevelValues, "", ""); - char **papszNoiseLevelList = CSLTokenizeString2( - noiseLevelValues, " ", CSLT_HONOURSTRINGS); - /* Get the Pixel Per range */ - this->m_nTableNoiseLevelsSize = - abs(this->stepSizeNoiseLevels) * - abs(this->numberOfValuesNoiseLevels); - - if ((EQUAL(calibType, "Beta Nought") && - this->m_eCalib == Beta0) || - (EQUAL(calibType, "Sigma Nought") && - this->m_eCalib == Sigma0) || - (EQUAL(calibType, "Gamma") && this->m_eCalib == Gamma)) + if (this->stepSizeNoiseLevels > 0 && + this->numberOfValuesNoiseLevels != INT_MIN && + abs(this->numberOfValuesNoiseLevels) < + INT_MAX / this->stepSizeNoiseLevels) { - /* Allocate the right Noise Levels size according to the - * product range pixel */ - this->m_nfTableNoiseLevels = InterpolateValues( - papszNoiseLevelList, this->m_nTableNoiseLevelsSize, - this->stepSizeNoiseLevels, - this->numberOfValuesNoiseLevels, - this->pixelFirstLutValueNoiseLevels); - } + char **papszNoiseLevelList = CSLTokenizeString2( + noiseLevelValues, " ", CSLT_HONOURSTRINGS); + /* Get the Pixel Per range */ + this->m_nTableNoiseLevelsSize = + abs(this->stepSizeNoiseLevels) * + abs(this->numberOfValuesNoiseLevels); + + if ((EQUAL(calibType, "Beta Nought") && + 
this->m_eCalib == Beta0) || + (EQUAL(calibType, "Sigma Nought") && + this->m_eCalib == Sigma0) || + (EQUAL(calibType, "Gamma") && this->m_eCalib == Gamma)) + { + /* Allocate the right Noise Levels size according to the + * product range pixel */ + this->m_nfTableNoiseLevels = InterpolateValues( + papszNoiseLevelList, this->m_nTableNoiseLevelsSize, + this->stepSizeNoiseLevels, + this->numberOfValuesNoiseLevels, + this->pixelFirstLutValueNoiseLevels); + } - CSLDestroy(papszNoiseLevelList); + CSLDestroy(papszNoiseLevelList); + } if (this->m_nfTableNoiseLevels != nullptr) { @@ -1582,47 +1589,52 @@ GDALDataset *RCMDataset::Open(GDALOpenInfo *poOpenInfo) CPLGetXMLValue(psIncidenceAngle.get(), "=incidenceAngles.pixelFirstAnglesValue", "0")); - int stepSize = atoi(CPLGetXMLValue( + const int stepSize = atoi(CPLGetXMLValue( psIncidenceAngle.get(), "=incidenceAngles.stepSize", "0")); - - int numberOfValues = + const int numberOfValues = atoi(CPLGetXMLValue(psIncidenceAngle.get(), "=incidenceAngles.numberOfValues", "0")); - /* Get the Pixel Per range */ - int tableSize = abs(stepSize) * abs(numberOfValues); + if (!(stepSize == 0 || stepSize == INT_MIN || + numberOfValues == INT_MIN || + abs(numberOfValues) > INT_MAX / abs(stepSize))) + { + /* Get the Pixel Per range */ + const int tableSize = abs(stepSize) * abs(numberOfValues); - CPLString angles; - // Loop through all nodes with spaces - CPLXMLNode *psNextNode = - CPLGetXMLNode(psIncidenceAngle.get(), "=incidenceAngles"); + CPLString angles; + // Loop through all nodes with spaces + CPLXMLNode *psNextNode = + CPLGetXMLNode(psIncidenceAngle.get(), "=incidenceAngles"); - CPLXMLNode *psNodeInc; - for (psNodeInc = psNextNode->psChild; psNodeInc != nullptr; - psNodeInc = psNodeInc->psNext) - { - if (EQUAL(psNodeInc->pszValue, "angles")) + CPLXMLNode *psNodeInc; + for (psNodeInc = psNextNode->psChild; psNodeInc != nullptr; + psNodeInc = psNodeInc->psNext) { - if (angles.length() > 0) + if (EQUAL(psNodeInc->pszValue, "angles")) { - angles.append(" "); /* separator */ + if (angles.length() > 0) + { + angles.append(" "); /* separator */ + } + const char *valAngle = + CPLGetXMLValue(psNodeInc, "", ""); + angles.append(valAngle); } - const char *valAngle = CPLGetXMLValue(psNodeInc, "", ""); - angles.append(valAngle); } - } - char **papszAngleList = - CSLTokenizeString2(angles, " ", CSLT_HONOURSTRINGS); + char **papszAngleList = + CSLTokenizeString2(angles, " ", CSLT_HONOURSTRINGS); - /* Allocate the right LUT size according to the product range pixel - */ - poDS->m_IncidenceAngleTableSize = tableSize; - poDS->m_nfIncidenceAngleTable = - InterpolateValues(papszAngleList, tableSize, stepSize, - numberOfValues, pixelFirstLutValue); + /* Allocate the right LUT size according to the product range pixel + */ + poDS->m_IncidenceAngleTableSize = tableSize; + poDS->m_nfIncidenceAngleTable = + InterpolateValues(papszAngleList, tableSize, stepSize, + numberOfValues, pixelFirstLutValue); - CSLDestroy(papszAngleList); + CSLDestroy(papszAngleList); + } } } @@ -1962,6 +1974,12 @@ GDALDataset *RCMDataset::Open(GDALOpenInfo *poOpenInfo) /* we should bomb gracefully... 
*/ pszLUT = pszSigma0LUT; } + if (!pszLUT) + { + CPLFree(pszFullname); + CPLError(CE_Failure, CPLE_AppDefined, "LUT missing."); + return nullptr; + } // The variable 'osNoiseLevelsValues' is always the same for a ban // name except the XML contains different calibration name @@ -2310,7 +2328,7 @@ GDALDataset *RCMDataset::Open(GDALOpenInfo *poOpenInfo) if (bUseProjInfo) { - poDS->m_oSRS = oPrj; + poDS->m_oSRS = std::move(oPrj); } else { @@ -2320,7 +2338,7 @@ GDALDataset *RCMDataset::Open(GDALOpenInfo *poOpenInfo) } } - poDS->m_oGCPSRS = oLL; + poDS->m_oGCPSRS = std::move(oLL); } /* -------------------------------------------------------------------- */ @@ -2448,33 +2466,25 @@ GDALDataset *RCMDataset::Open(GDALOpenInfo *poOpenInfo) case Sigma0: { osSubdatasetName = szSIGMA0; - CPLString pszDescriptionSigma = - FormatCalibration(szSIGMA0, osMDFilename.c_str()); - osDescription = pszDescriptionSigma; + osDescription = FormatCalibration(szSIGMA0, osMDFilename.c_str()); } break; case Beta0: { osSubdatasetName = szBETA0; - CPLString pszDescriptionBeta = - FormatCalibration(szBETA0, osMDFilename.c_str()); - osDescription = pszDescriptionBeta; + osDescription = FormatCalibration(szBETA0, osMDFilename.c_str()); } break; case Gamma: { osSubdatasetName = szGAMMA; - CPLString pszDescriptionGamma = - FormatCalibration(szGAMMA, osMDFilename.c_str()); - osDescription = pszDescriptionGamma; + osDescription = FormatCalibration(szGAMMA, osMDFilename.c_str()); } break; case Uncalib: { osSubdatasetName = szUNCALIB; - CPLString pszDescriptionUncalib = - FormatCalibration(szUNCALIB, osMDFilename.c_str()); - osDescription = pszDescriptionUncalib; + osDescription = FormatCalibration(szUNCALIB, osMDFilename.c_str()); } break; default: From bb64609eded76efd70865a521780aec68146a440 Mon Sep 17 00:00:00 2001 From: Even Rouault Date: Sat, 9 Nov 2024 17:16:28 +0100 Subject: [PATCH 48/62] autotest: fix test failure with msys2 mingw64 A recent update of msys2 mingw (presumably Python 3.11 -> 3.12) causes the test_gnm_filenetwork_open() test to fail when run just after test_gnm_filenetwork_create(), presumably because the just created dataset has not been properly closed. 
Cf https://github.com/OSGeo/gdal/actions/runs/11755814302/job/32752542561?pr=11224 --- autotest/gnm/gnm_test.py | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/autotest/gnm/gnm_test.py b/autotest/gnm/gnm_test.py index 530d2db8fdfd..f82cbaf71270 100755 --- a/autotest/gnm/gnm_test.py +++ b/autotest/gnm/gnm_test.py @@ -36,7 +36,7 @@ def test_gnm_filenetwork_create(): pass drv = gdal.GetDriverByName("GNMFile") - ds = drv.Create( + with drv.Create( "tmp/", 0, 0, @@ -47,17 +47,15 @@ def test_gnm_filenetwork_create(): "net_description=Test file based GNM", "net_srs=EPSG:4326", ], - ) - # cast to GNM - dn = gnm.CastToNetwork(ds) - assert dn is not None - assert dn.GetVersion() == 100, "GNM: Check GNM version failed" - assert dn.GetName() == "test_gnm", "GNM: Check GNM name failed" - assert ( - dn.GetDescription() == "Test file based GNM" - ), "GNM: Check GNM description failed" - - dn = None + ) as ds: + # cast to GNM + dn = gnm.CastToNetwork(ds) + assert dn is not None + assert dn.GetVersion() == 100, "GNM: Check GNM version failed" + assert dn.GetName() == "test_gnm", "GNM: Check GNM name failed" + assert ( + dn.GetDescription() == "Test file based GNM" + ), "GNM: Check GNM description failed" ############################################################################### From 4b5788629a372704faebf457ed8d10935144910f Mon Sep 17 00:00:00 2001 From: Even Rouault Date: Sun, 10 Nov 2024 00:11:56 +0100 Subject: [PATCH 49/62] cmake/modules/thirdparty/FindDotnet.cmake: remove obsolete cmake_minimum_required() --- cmake/modules/thirdparty/FindDotnet.cmake | 69 +++++++++++------------ 1 file changed, 33 insertions(+), 36 deletions(-) diff --git a/cmake/modules/thirdparty/FindDotnet.cmake b/cmake/modules/thirdparty/FindDotnet.cmake index 945688b6eb47..499634228d63 100644 --- a/cmake/modules/thirdparty/FindDotnet.cmake +++ b/cmake/modules/thirdparty/FindDotnet.cmake @@ -5,24 +5,24 @@ # # FindDotnet # ---------- -# +# # Find DotNet executable, and initialize functions for adding dotnet projects. -# +# # Results are reported in the following variables:: -# +# # DOTNET_FOUND - True if dotnet executable is found # DOTNET_EXE - Dotnet executable # DOTNET_VERSION - Dotnet version as reported by dotnet executable # DOTNET_SDKS - Dotnet SDKs loaded as reported by dotnet executable # NUGET_EXE - Nuget executable (WIN32 only) # NUGET_CACHE_PATH - Nuget package cache path -# +# # The following functions are defined to add dotnet/msbuild projects: -# +# # ADD_DOTNET -- add a project to be built by dotnet. -# +# # ``` -# ADD_DOTNET( [RELEASE|DEBUG] [X86|X64|ANYCPU] +# ADD_DOTNET( [RELEASE|DEBUG] [X86|X64|ANYCPU] # [CONFIG configuration] # [PLATFORM platform] # [PACKAGE nuget_package_dependencies... ] @@ -34,12 +34,12 @@ # [ARGUMENTS additional_build_args...] # [PACK_ARGUMENTS additional_pack_args...]) # ``` -# -# RUN_DOTNET -- Run a project with `dotnet run`. The `OUTPUT` argument represents artifacts +# +# RUN_DOTNET -- Run a project with `dotnet run`. The `OUTPUT` argument represents artifacts # produced by running the .NET program, and can be consumed from other build steps. -# +# # ``` -# RUN_DOTNET( [RELEASE|DEBUG] [X86|X64|ANYCPU] +# RUN_DOTNET( [RELEASE|DEBUG] [X86|X64|ANYCPU] # [ARGUMENTS program_args...] # [OUTPUT outputs...] # [CONFIG configuration] @@ -49,11 +49,11 @@ # [CUSTOM_BUILDPROPS value....] # [SOURCES additional_file_dependencies... ]) # ``` -# +# # ADD_MSBUILD -- add a project to be built by msbuild. Windows-only. 
When building in Unix systems, msbuild targets are skipped. -# +# # ``` -# ADD_MSBUILD( [RELEASE|DEBUG] [X86|X64|ANYCPU] +# ADD_MSBUILD( [RELEASE|DEBUG] [X86|X64|ANYCPU] # [CONFIG configuration] # [PLATFORM platform] # [PACKAGE output_nuget_packages... ] @@ -68,7 +68,7 @@ # and if the program fails to build or run, the build fails. Currently only .NET Core App framework is supported. # Multiple smoke tests will be run one-by-one to avoid global resource conflicts. # -# SMOKETEST_DOTNET( [RELEASE|DEBUG] [X86|X64|ANYCPU] +# SMOKETEST_DOTNET( [RELEASE|DEBUG] [X86|X64|ANYCPU] # [ARGUMENTS program_args...] # [CONFIG configuration] # [PLATFORM platform] @@ -76,12 +76,12 @@ # [OUTPUT_PATH output_path relative to cmake binary output dir] # [CUSTOM_BUILDPROPS value....] # [SOURCES additional_file_dependencies... ]) -# +# # For all the above functions, `RELEASE|DEBUG` overrides `CONFIG`, `X86|X64|ANYCPU` overrides PLATFORM. # # # DOTNET_REGISTER_LOCAL_REPOSITORY -- register a local NuGet package repository. -# +# # ``` # DOTNET_REGISTER_LOCAL_REPOSITORY(repo_name repo_path) # ``` @@ -97,7 +97,7 @@ # [ARGUMENTS additional_dotnet_test_args...] # [OUTPUT_PATH output_path relative to cmake binary output dir]) # ``` -# +# # GEN_DOTNET_PROPS -- Generates a Directory.Build.props file. The created file is populated with MSBuild properties: # - DOTNET_PACKAGE_VERSION: a version string that can be referenced in the actual project file as $(DOTNET_PACKAGE_VERSION). # The version string value can be set with PACKAGE_VERSION argument, and defaults to '1.0.0'. @@ -111,10 +111,7 @@ # [PACKAGE_VERSION version] # [XML_INJECT xml_injection]) # ``` -# -# Require 3.5 for batch copy multiple files - -cmake_minimum_required(VERSION 3.5.0) +# IF(DOTNET_FOUND) RETURN() @@ -184,11 +181,11 @@ ENDFUNCTION() FUNCTION(DOTNET_GET_DEPS _DN_PROJECT arguments) CMAKE_PARSE_ARGUMENTS( # prefix - _DN + _DN # options (flags) - "RELEASE;DEBUG;X86;X64;ANYCPU;NETCOREAPP" + "RELEASE;DEBUG;X86;X64;ANYCPU;NETCOREAPP" # oneValueArgs - "NAME;CONFIG;PLATFORM;VERSION;OUTPUT_PATH" + "NAME;CONFIG;PLATFORM;VERSION;OUTPUT_PATH" # multiValueArgs "PACKAGE;DEPENDS;ARGUMENTS;PACK_ARGUMENTS;OUTPUT;SOURCES;CUSTOM_BUILDPROPS,BUILD_OPTIONS" # the input arguments @@ -199,7 +196,7 @@ FUNCTION(DOTNET_GET_DEPS _DN_PROJECT arguments) GET_FILENAME_COMPONENT(_DN_projname "${_DN_PROJECT}" NAME) STRING(REGEX REPLACE "\\.[^.]*$" "" _DN_projname_noext ${_DN_projname}) - FILE(GLOB_RECURSE DOTNET_deps + FILE(GLOB_RECURSE DOTNET_deps ${_DN_proj_dir}/*.cs ${_DN_proj_dir}/*.fs ${_DN_proj_dir}/*.vb @@ -328,14 +325,14 @@ ENDMACRO() MACRO(DOTNET_BUILD_COMMANDS) IF(${DOTNET_IS_MSBUILD}) - SET(build_dotnet_cmds + SET(build_dotnet_cmds COMMAND ${CMAKE_COMMAND} -E echo "======= Building msbuild project ${DOTNET_PROJNAME} [${DOTNET_CONFIG} ${DOTNET_PLATFORM}]" COMMAND ${NUGET_EXE} restore -Force ${DOTNET_PROJPATH} COMMAND ${DOTNET_EXE} msbuild ${DOTNET_PROJPATH} /t:Clean ${DOTNET_BUILD_PROPERTIES} /p:Configuration="${DOTNET_CONFIG}" COMMAND ${DOTNET_EXE} msbuild ${DOTNET_PROJPATH} /t:Build ${DOTNET_BUILD_PROPERTIES} /p:Configuration="${DOTNET_CONFIG}" ${DOTNET_ARGUMENTS}) SET(build_dotnet_type "msbuild") ELSE() - SET(build_dotnet_cmds + SET(build_dotnet_cmds COMMAND ${CMAKE_COMMAND} -E echo "======= Building .NET project ${DOTNET_PROJNAME} [${DOTNET_CONFIG} ${DOTNET_PLATFORM}]") foreach (_src ${DOTNET_SOURCES} ) LIST(APPEND build_dotnet_cmds COMMAND ${DOTNET_EXE} add ${DOTNET_PROJPATH} reference ${_src}) @@ -362,10 +359,10 @@ MACRO(DOTNET_BUILD_COMMANDS) MESSAGE("-- Adding 
${build_dotnet_type} project ${DOTNET_PROJPATH} (no nupkg)") ENDIF() endif() - - LIST(APPEND build_dotnet_cmds COMMAND ${DOTNET_EXE} pack - --no-build --no-restore ${DOTNET_PROJPATH} - -c ${DOTNET_CONFIG} ${DOTNET_BUILD_PROPERTIES} ${DOTNET_PACK_OPTIONS} + + LIST(APPEND build_dotnet_cmds COMMAND ${DOTNET_EXE} pack + --no-build --no-restore ${DOTNET_PROJPATH} + -c ${DOTNET_CONFIG} ${DOTNET_BUILD_PROPERTIES} ${DOTNET_PACK_OPTIONS} --output ${CMAKE_CURRENT_BINARY_DIR} -p:PackageVersion=${DOTNET_PACKAGE_VERSION} ) LIST(APPEND DOTNET_OUTPUTS ${CMAKE_CURRENT_BINARY_DIR}/${DOTNET_PROJNAME}.buildtimestamp) LIST(APPEND build_dotnet_cmds COMMAND ${CMAKE_COMMAND} -E touch ${CMAKE_CURRENT_BINARY_DIR}/${DOTNET_PROJNAME}.buildtimestamp) @@ -416,7 +413,7 @@ FUNCTION(RUN_DOTNET DOTNET_PROJECT) COMMAND ${CMAKE_COMMAND} -E touch ${CMAKE_CURRENT_BINARY_DIR}/${DOTNET_PROJNAME}.runtimestamp WORKING_DIRECTORY ${DOTNET_OUTPUT_PATH}) ADD_CUSTOM_TARGET( - ${DOTNET_PROJNAME} + ${DOTNET_PROJNAME} DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/${DOTNET_PROJNAME}.runtimestamp ${DOTNET_RUN_OUTPUT}) ADD_DOTNET_DEPENDENCY_TARGETS() ENDFUNCTION() @@ -469,9 +466,9 @@ FUNCTION(GEN_DOTNET_PROPS target_props_file) # prefix _DNP # options (flags) - "" + "" # oneValueArgs - "PACKAGE_VERSION;XML_INJECT" + "PACKAGE_VERSION;XML_INJECT" # multiValueArgs "" # the input arguments @@ -496,4 +493,4 @@ ENDFUNCTION() MESSAGE("-- Found .NET toolchain: ${DOTNET_EXE} (version ${DOTNET_VERSION})") -SET(DOTNET_FOUND TRUE) \ No newline at end of file +SET(DOTNET_FOUND TRUE) From 720deb86d4c49b727829c2744cc96f406dfb3739 Mon Sep 17 00:00:00 2001 From: Even Rouault Date: Sat, 9 Nov 2024 22:24:42 +0100 Subject: [PATCH 50/62] autotest/cpp/googletest/CMakeLists.txt.in: bump minimum cmake_minimum_required() to avoid warnings about CMake 3.10 support being soon removed with newer CMake --- autotest/cpp/googletest/CMakeLists.txt.in | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/autotest/cpp/googletest/CMakeLists.txt.in b/autotest/cpp/googletest/CMakeLists.txt.in index 1cdcf819370b..2d48d04cdc40 100644 --- a/autotest/cpp/googletest/CMakeLists.txt.in +++ b/autotest/cpp/googletest/CMakeLists.txt.in @@ -1,5 +1,5 @@ # Source https://github.com/google/googletest/blob/master/googletest/README.md -cmake_minimum_required(VERSION 3.9) +cmake_minimum_required(VERSION 3.16) project(googletest-download NONE) From 566f90d41fd828a853b9ae3f70f041828b63236c Mon Sep 17 00:00:00 2001 From: Even Rouault Date: Sun, 10 Nov 2024 03:48:25 +0100 Subject: [PATCH 51/62] autotest: fix memory leak --- autotest/cpp/test_ogr.cpp | 39 ++++++++++++++++++--------------------- 1 file changed, 18 insertions(+), 21 deletions(-) diff --git a/autotest/cpp/test_ogr.cpp b/autotest/cpp/test_ogr.cpp index b43f40d1cd65..a8a3b57f55a5 100644 --- a/autotest/cpp/test_ogr.cpp +++ b/autotest/cpp/test_ogr.cpp @@ -4241,9 +4241,8 @@ TEST_F(test_ogr, OGRCurve_reversePoints) TEST_F(test_ogr, transformWithOptions) { // Projected CRS to national geographic CRS (not including poles or antimeridian) - OGRGeometry *poGeom = nullptr; - OGRGeometryFactory::createFromWkt( - "LINESTRING(700000 6600000, 700001 6600001)", nullptr, &poGeom); + auto [poGeom, err] = OGRGeometryFactory::createFromWkt( + "LINESTRING(700000 6600000, 700001 6600001)"); ASSERT_NE(poGeom, nullptr); OGRSpatialReference oEPSG_2154; @@ -4254,12 +4253,12 @@ TEST_F(test_ogr, transformWithOptions) auto poCT = std::unique_ptr( OGRCreateCoordinateTransformation(&oEPSG_2154, &oEPSG_4171)); OGRGeometryFactory::TransformWithOptionsCache oCache; 
- poGeom = OGRGeometryFactory::transformWithOptions(poGeom, poCT.get(), - nullptr, oCache); - EXPECT_NEAR(poGeom->toLineString()->getX(0), 3, 1e-8); - EXPECT_NEAR(poGeom->toLineString()->getY(0), 46.5, 1e-8); - - delete poGeom; + auto poNewGeom = + std::unique_ptr(OGRGeometryFactory::transformWithOptions( + poGeom.get(), poCT.get(), nullptr, oCache)); + ASSERT_NE(poNewGeom, nullptr); + EXPECT_NEAR(poNewGeom->toLineString()->getX(0), 3, 1e-8); + EXPECT_NEAR(poNewGeom->toLineString()->getY(0), 46.5, 1e-8); } #ifdef HAVE_GEOS @@ -4268,10 +4267,8 @@ TEST_F(test_ogr, transformWithOptions) TEST_F(test_ogr, transformWithOptions_GEOS) { // Projected CRS to national geographic CRS including antimeridian - OGRGeometry *poGeom = nullptr; - OGRGeometryFactory::createFromWkt( - "LINESTRING(657630.64 4984896.17,815261.43 4990738.26)", nullptr, - &poGeom); + auto [poGeom, err] = OGRGeometryFactory::createFromWkt( + "LINESTRING(657630.64 4984896.17,815261.43 4990738.26)"); ASSERT_NE(poGeom, nullptr); OGRSpatialReference oEPSG_6329; @@ -4282,12 +4279,14 @@ TEST_F(test_ogr, transformWithOptions_GEOS) auto poCT = std::unique_ptr( OGRCreateCoordinateTransformation(&oEPSG_6329, &oEPSG_6318)); OGRGeometryFactory::TransformWithOptionsCache oCache; - poGeom = OGRGeometryFactory::transformWithOptions(poGeom, poCT.get(), - nullptr, oCache); - EXPECT_EQ(poGeom->getGeometryType(), wkbMultiLineString); - if (poGeom->getGeometryType() == wkbMultiLineString) - { - const auto poMLS = poGeom->toMultiLineString(); + auto poNewGeom = + std::unique_ptr(OGRGeometryFactory::transformWithOptions( + poGeom.get(), poCT.get(), nullptr, oCache)); + ASSERT_NE(poNewGeom, nullptr); + EXPECT_EQ(poNewGeom->getGeometryType(), wkbMultiLineString); + if (poNewGeom->getGeometryType() == wkbMultiLineString) + { + const auto poMLS = poNewGeom->toMultiLineString(); EXPECT_EQ(poMLS->getNumGeometries(), 2); if (poMLS->getNumGeometries() == 2) { @@ -4302,8 +4301,6 @@ TEST_F(test_ogr, transformWithOptions_GEOS) } } } - - delete poGeom; } #endif From 37bec729c3bbbbf2098bcd4185ec9863a33d50d5 Mon Sep 17 00:00:00 2001 From: Even Rouault Date: Sun, 10 Nov 2024 14:48:50 +0100 Subject: [PATCH 52/62] LVBAG: only run IsValid() if bFixInvalidData This will speed up processing if not needing to fix invalid data. 
May be related to https://lists.osgeo.org/pipermail/gdal-dev/2024-November/059794.html --- ogr/ogrsf_frmts/lvbag/ogrlvbaglayer.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ogr/ogrsf_frmts/lvbag/ogrlvbaglayer.cpp b/ogr/ogrsf_frmts/lvbag/ogrlvbaglayer.cpp index 796174cce9d1..b5550ee3f3f9 100644 --- a/ogr/ogrsf_frmts/lvbag/ogrlvbaglayer.cpp +++ b/ogr/ogrsf_frmts/lvbag/ogrlvbaglayer.cpp @@ -638,7 +638,7 @@ void OGRLVBAGLayer::EndElementCbk(const char *pszName) poGeom->flattenTo2D(); #ifdef HAVE_GEOS - if (!poGeom->IsValid() && bFixInvalidData) + if (bFixInvalidData && !poGeom->IsValid()) { std::unique_ptr poSubGeom = std::unique_ptr{poGeom->MakeValid()}; From 69d4dd3745b7af3557d7b0ab03f13eaddd430ad9 Mon Sep 17 00:00:00 2001 From: Even Rouault Date: Sun, 10 Nov 2024 15:05:14 +0100 Subject: [PATCH 53/62] autotest: update internal googletest to 1.15.2 to avoid CMake warnings --- autotest/cpp/googletest/CMakeLists.txt.in | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/autotest/cpp/googletest/CMakeLists.txt.in b/autotest/cpp/googletest/CMakeLists.txt.in index 2d48d04cdc40..5055e6628fae 100644 --- a/autotest/cpp/googletest/CMakeLists.txt.in +++ b/autotest/cpp/googletest/CMakeLists.txt.in @@ -10,8 +10,8 @@ endif() include(ExternalProject) ExternalProject_Add(googletest - URL https://github.com/google/googletest/archive/release-1.12.1.zip - URL_HASH SHA1=973e464e8936d4b79bb24f27b058aaef4150b06e + URL https://github.com/google/googletest/releases/download/v1.15.2/googletest-1.15.2.tar.gz + URL_HASH SHA1=568d58e26bd4e838449ca7ab8ebc152b3cbd210d DOWNLOAD_NO_PROGRESS ON SOURCE_DIR "${CMAKE_CURRENT_BINARY_DIR}/googletest-src" BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}/googletest-build" From 3646323733cc9960ad5a1a897a944bf3da2352ff Mon Sep 17 00:00:00 2001 From: "Peter A. Jonsson" Date: Sun, 10 Nov 2024 13:55:50 +0100 Subject: [PATCH 54/62] docker: enable IPO This reduces the size of the small Ubuntu image by 4 MB. There also seems to be a slight speedup, the user time for the example from #10809 goes from 34.x seconds to 33.x seconds on my computer. 
--- docker/ubuntu-full/bh-gdal.sh | 4 +++- docker/ubuntu-small/Dockerfile | 7 ++++++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/docker/ubuntu-full/bh-gdal.sh b/docker/ubuntu-full/bh-gdal.sh index 3c66862a6c73..b4a7659463c6 100755 --- a/docker/ubuntu-full/bh-gdal.sh +++ b/docker/ubuntu-full/bh-gdal.sh @@ -46,7 +46,9 @@ wget -q "https://github.com/${GDAL_REPOSITORY}/archive/${GDAL_VERSION}.tar.gz" \ cd build # GDAL_USE_TIFF_INTERNAL=ON to use JXL export GDAL_CMAKE_EXTRA_OPTS="" - if test "${GCC_ARCH}" != "x86_64"; then + if test "${GCC_ARCH}" = "x86_64"; then + export GDAL_CMAKE_EXTRA_OPTS="${GDAL_CMAKE_EXTRA_OPTS} -DENABLE_IPO=ON" + else export GDAL_CMAKE_EXTRA_OPTS="${GDAL_CMAKE_EXTRA_OPTS} -DPDFIUM_INCLUDE_DIR=" fi export JAVA_ARCH="" diff --git a/docker/ubuntu-small/Dockerfile b/docker/ubuntu-small/Dockerfile index 524f52b35488..65cf4fd17b3a 100644 --- a/docker/ubuntu-small/Dockerfile +++ b/docker/ubuntu-small/Dockerfile @@ -153,6 +153,11 @@ RUN --mount=type=cache,id=ubuntu-small-gdal,target=$HOME/.cache \ && if test "x${GDAL_BUILD_IS_RELEASE:-}" = "x"; then \ export GDAL_SHA1SUM=${GDAL_VERSION}; \ fi \ + && if test "${GCC_ARCH}" = "x86_64"; then \ + export GDAL_CMAKE_EXTRA_OPTS="-DENABLE_IPO=ON"; \ + else \ + export GDAL_CMAKE_EXTRA_OPTS=""; \ + fi \ && mkdir gdal \ && wget -q https://github.com/${GDAL_REPOSITORY}/archive/${GDAL_VERSION}.tar.gz -O - \ | tar xz -C gdal --strip-components=1 \ @@ -183,7 +188,7 @@ RUN --mount=type=cache,id=ubuntu-small-gdal,target=$HOME/.cache \ -DPROJ_INCLUDE_DIR="/build${PROJ_INSTALL_PREFIX-/usr/local}/include" \ -DPROJ_LIBRARY="/build${PROJ_INSTALL_PREFIX-/usr/local}/lib/libinternalproj.so" \ -DGDAL_USE_TIFF_INTERNAL=ON \ - -DGDAL_USE_GEOTIFF_INTERNAL=ON \ + -DGDAL_USE_GEOTIFF_INTERNAL=ON ${GDAL_CMAKE_EXTRA_OPTS} \ -DBUILD_TESTING=OFF \ && ninja \ && DESTDIR="/build" ninja install \ From 929a51e2943ed714988701bc55cdf45ff1f94d8d Mon Sep 17 00:00:00 2001 From: Even Rouault Date: Sun, 10 Nov 2024 20:44:59 +0100 Subject: [PATCH 55/62] RCM: fix opening LUT and noise level files on subdatasets --- frmts/rcm/rcmdataset.cpp | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/frmts/rcm/rcmdataset.cpp b/frmts/rcm/rcmdataset.cpp index a84746d955db..23d85cc5e492 100644 --- a/frmts/rcm/rcmdataset.cpp +++ b/frmts/rcm/rcmdataset.cpp @@ -1988,10 +1988,7 @@ GDALDataset *RCMDataset::Open(GDALOpenInfo *poOpenInfo) // If Complex, always 32 bits RCMCalibRasterBand *poBand = new RCMCalibRasterBand( poDS.get(), pszPole, GDT_Float32, poBandFile.release(), - eCalib, CPLFormFilename(osPath, pszLUT, nullptr), - CPLFormFilename(osPath, osNoiseLevelsValues.c_str(), - nullptr), - eDataType); + eCalib, pszLUT, osNoiseLevelsValues.c_str(), eDataType); poDS->SetBand(poDS->GetRasterCount() + 1, poBand); } else @@ -1999,10 +1996,7 @@ GDALDataset *RCMDataset::Open(GDALOpenInfo *poOpenInfo) // Whatever the datatype was previoulsy set RCMCalibRasterBand *poBand = new RCMCalibRasterBand( poDS.get(), pszPole, eDataType, poBandFile.release(), - eCalib, CPLFormFilename(osPath, pszLUT, nullptr), - CPLFormFilename(osPath, osNoiseLevelsValues.c_str(), - nullptr), - eDataType); + eCalib, pszLUT, osNoiseLevelsValues.c_str(), eDataType); poDS->SetBand(poDS->GetRasterCount() + 1, poBand); } } From 47bed5685bc439ed48644b36bd9b3917c900a2ed Mon Sep 17 00:00:00 2001 From: Even Rouault Date: Sun, 10 Nov 2024 19:37:56 +0100 Subject: [PATCH 56/62] RCM: fix CodeQL warnings about 'Multiplication result converted to larger type' (master only) --- 
frmts/rcm/rcmdataset.cpp | 164 +++++++++++++++------------------------ 1 file changed, 61 insertions(+), 103 deletions(-) diff --git a/frmts/rcm/rcmdataset.cpp b/frmts/rcm/rcmdataset.cpp index 23d85cc5e492..87044835940c 100644 --- a/frmts/rcm/rcmdataset.cpp +++ b/frmts/rcm/rcmdataset.cpp @@ -301,38 +301,16 @@ RCMRasterBand::~RCMRasterBand() CPLErr RCMRasterBand::IReadBlock(int nBlockXOff, int nBlockYOff, void *pImage) { - int nRequestYSize; - int nRequestXSize; + int nRequestXSize = 0; + int nRequestYSize = 0; + GetActualBlockSize(nBlockXOff, nBlockYOff, &nRequestXSize, &nRequestYSize); - /* -------------------------------------------------------------------- */ - /* If the last strip is partial, we need to avoid */ - /* over-requesting. We also need to initialize the extra part */ - /* of the block to zero. */ - /* -------------------------------------------------------------------- */ - if ((nBlockYOff + 1) * nBlockYSize > nRasterYSize) - { - nRequestYSize = nRasterYSize - nBlockYOff * nBlockYSize; - memset(pImage, 0, - GDALGetDataTypeSizeBytes(eDataType) * nBlockXSize * nBlockYSize); - } - else - { - nRequestYSize = nBlockYSize; - } - - /*-------------------------------------------------------------------- */ - /* If the input imagery is tiled, also need to avoid over- */ - /* requesting in the X-direction. */ - /* ------------------------------------------------------------------- */ - if ((nBlockXOff + 1) * nBlockXSize > nRasterXSize) + // Zero initial partial right-most and bottom-most blocks + if (nRequestXSize < nBlockXSize || nRequestYSize < nBlockYSize) { - nRequestXSize = nRasterXSize - nBlockXOff * nBlockXSize; memset(pImage, 0, - GDALGetDataTypeSizeBytes(eDataType) * nBlockXSize * nBlockYSize); - } - else - { - nRequestXSize = nBlockXSize; + static_cast(GDALGetDataTypeSizeBytes(eDataType)) * + nBlockXSize * nBlockYSize); } int dataTypeSize = GDALGetDataTypeSizeBytes(eDataType); @@ -355,14 +333,16 @@ CPLErr RCMRasterBand::IReadBlock(int nBlockXOff, int nBlockYOff, void *pImage) GF_Read, nBlockXOff * nBlockXSize, nBlockYOff * nBlockYSize, nRequestXSize, nRequestYSize, pImage, nRequestXSize, nRequestYSize, bandFileType, 2, nullptr, dataTypeSize, - dataTypeSize * nBlockXSize, bandFileSize, nullptr); + static_cast(dataTypeSize) * nBlockXSize, bandFileSize, + nullptr); } else if (twoBandComplex && this->isNITF) { return poBand->RasterIO( GF_Read, nBlockXOff * nBlockXSize, nBlockYOff * nBlockYSize, nRequestXSize, nRequestYSize, pImage, nRequestXSize, nRequestYSize, - eDataType, 0, dataTypeSize * nBlockXSize, nullptr); + eDataType, 0, static_cast(dataTypeSize) * nBlockXSize, + nullptr); } if (poRCMDataset->IsComplexData()) @@ -377,7 +357,8 @@ CPLErr RCMRasterBand::IReadBlock(int nBlockXOff, int nBlockYOff, void *pImage) GF_Read, nBlockXOff * nBlockXSize, nBlockYOff * nBlockYSize, nRequestXSize, nRequestYSize, pImage, nRequestXSize, nRequestYSize, bandFileType, 2, nullptr, dataTypeSize, - nBlockXSize * dataTypeSize, bandFileSize, nullptr); + static_cast(dataTypeSize) * nBlockXSize, bandFileSize, + nullptr); } // case: band file == this band @@ -388,7 +369,8 @@ CPLErr RCMRasterBand::IReadBlock(int nBlockXOff, int nBlockYOff, void *pImage) return poBand->RasterIO( GF_Read, nBlockXOff * nBlockXSize, nBlockYOff * nBlockYSize, nRequestXSize, nRequestYSize, pImage, nRequestXSize, nRequestYSize, - eDataType, 0, dataTypeSize * nBlockXSize, nullptr); + eDataType, 0, static_cast(dataTypeSize) * nBlockXSize, + nullptr); } else { @@ -484,8 +466,7 @@ void RCMCalibRasterBand::ReadLUT() const size_t 
nLen = this->m_nTableSize * max_space_for_string; // 32 max + space - char *lut_gains = static_cast(CPLMalloc(nLen)); - memset(lut_gains, 0, nLen); + char *lut_gains = static_cast(CPLCalloc(1, nLen)); for (int i = 0; i < this->m_nTableSize; i++) { @@ -680,53 +661,32 @@ CPLErr RCMCalibRasterBand::IReadBlock(int nBlockXOff, int nBlockYOff, void *pImage) { CPLErr eErr; - int nRequestYSize; - int nRequestXSize; + int nRequestXSize = 0; + int nRequestYSize = 0; + GetActualBlockSize(nBlockXOff, nBlockYOff, &nRequestXSize, &nRequestYSize); - /* -------------------------------------------------------------------- */ - /* If the last strip is partial, we need to avoid */ - /* over-requesting. We also need to initialize the extra part */ - /* of the block to zero. */ - /* -------------------------------------------------------------------- */ - if ((nBlockYOff + 1) * nBlockYSize > nRasterYSize) + // Zero initial partial right-most and bottom-most blocks + if (nRequestXSize < nBlockXSize || nRequestYSize < nBlockYSize) { - nRequestYSize = nRasterYSize - nBlockYOff * nBlockYSize; memset(pImage, 0, - GDALGetDataTypeSizeBytes(eDataType) * nBlockXSize * nBlockYSize); - } - else - { - nRequestYSize = nBlockYSize; - } - - /*-------------------------------------------------------------------- */ - /* If the input imagery is tiled, also need to avoid over- */ - /* requesting in the X-direction. */ - /* ------------------------------------------------------------------- */ - if ((nBlockXOff + 1) * nBlockXSize > nRasterXSize) - { - nRequestXSize = nRasterXSize - nBlockXOff * nBlockXSize; - memset(pImage, 0, - GDALGetDataTypeSizeBytes(eDataType) * nBlockXSize * nBlockYSize); - } - else - { - nRequestXSize = nBlockXSize; + static_cast(GDALGetDataTypeSizeBytes(eDataType)) * + nBlockXSize * nBlockYSize); } if (this->m_eOriginalType == GDT_CInt16) { - GInt16 *pnImageTmp; /* read in complex values */ - pnImageTmp = static_cast( - CPLMalloc(nBlockXSize * nBlockYSize * - GDALGetDataTypeSizeBytes(m_eOriginalType))); + GInt16 *panImageTmp = static_cast( + VSI_MALLOC3_VERBOSE(nBlockXSize, nBlockYSize, + GDALGetDataTypeSizeBytes(m_eOriginalType))); + if (!panImageTmp) + return CE_Failure; if (m_poBandDataset->GetRasterCount() == 2) { eErr = m_poBandDataset->RasterIO( GF_Read, nBlockXOff * nBlockXSize, nBlockYOff * nBlockYSize, - nRequestXSize, nRequestYSize, pnImageTmp, nRequestXSize, + nRequestXSize, nRequestYSize, panImageTmp, nRequestXSize, nRequestYSize, this->m_eOriginalType, 2, nullptr, 4, nBlockXSize * 4, 4, nullptr); @@ -735,7 +695,7 @@ CPLErr RCMCalibRasterBand::IReadBlock(int nBlockXOff, int nBlockYOff, nBlockXOff * nBlockXSize, nBlockYOff * nBlockYSize, nRequestXSize, nRequestYSize, - pnImageTmp, nRequestXSize, nRequestYSize, + panImageTmp, nRequestXSize, nRequestYSize, GDT_Int32, 2, nullptr, 4, nBlockXSize * 4, 2, nullptr); */ @@ -744,7 +704,7 @@ CPLErr RCMCalibRasterBand::IReadBlock(int nBlockXOff, int nBlockYOff, { eErr = m_poBandDataset->RasterIO( GF_Read, nBlockXOff * nBlockXSize, nBlockYOff * nBlockYSize, - nRequestXSize, nRequestYSize, pnImageTmp, nRequestXSize, + nRequestXSize, nRequestYSize, panImageTmp, nRequestXSize, nRequestYSize, this->m_eOriginalType, 1, nullptr, 4, nBlockXSize * 4, 0, nullptr); @@ -754,7 +714,7 @@ CPLErr RCMCalibRasterBand::IReadBlock(int nBlockXOff, int nBlockYOff, nBlockXOff * nBlockXSize, nBlockYOff * nBlockYSize, nRequestXSize, nRequestYSize, - pnImageTmp, nRequestXSize, nRequestYSize, + panImageTmp, nRequestXSize, nRequestYSize, GDT_UInt32, 1, nullptr, 4, nBlockXSize * 4, 0, 
nullptr); */ @@ -778,8 +738,8 @@ CPLErr RCMCalibRasterBand::IReadBlock(int nBlockXOff, int nBlockYOff, const int nTruePixOff = (i * nBlockXSize) + j; // Formula for Complex Q+J - const float real = static_cast(pnImageTmp[nPixOff]); - const float img = static_cast(pnImageTmp[nPixOff + 1]); + const float real = static_cast(panImageTmp[nPixOff]); + const float img = static_cast(panImageTmp[nPixOff + 1]); const float digitalValue = (real * real) + (img * img); const float lutValue = static_cast(m_nfTable[nBlockXOff * nBlockXSize + j]); @@ -789,7 +749,7 @@ CPLErr RCMCalibRasterBand::IReadBlock(int nBlockXOff, int nBlockYOff, } } - CPLFree(pnImageTmp); + CPLFree(panImageTmp); } // If the underlying file is NITF CFloat32 @@ -797,25 +757,26 @@ CPLErr RCMCalibRasterBand::IReadBlock(int nBlockXOff, int nBlockYOff, this->m_eOriginalType == GDT_CFloat64) { /* read in complex values */ - float *pnImageTmp; - const int dataTypeSize = GDALGetDataTypeSizeBytes(this->m_eOriginalType); const GDALDataType bandFileType = this->m_eOriginalType; - const int bandFileSize = GDALGetDataTypeSizeBytes(bandFileType); + const int bandFileDataTypeSize = GDALGetDataTypeSizeBytes(bandFileType); /* read the original image complex values in a temporary image space */ - pnImageTmp = static_cast( - CPLMalloc(2 * nBlockXSize * nBlockYSize * bandFileSize)); + float *pafImageTmp = static_cast(VSI_MALLOC3_VERBOSE( + nBlockXSize, nBlockYSize, 2 * bandFileDataTypeSize)); + if (!pafImageTmp) + return CE_Failure; eErr = // I and Q from each band are pixel-interleaved into this complex // band m_poBandDataset->RasterIO( GF_Read, nBlockXOff * nBlockXSize, nBlockYOff * nBlockYSize, - nRequestXSize, nRequestYSize, pnImageTmp, nRequestXSize, + nRequestXSize, nRequestYSize, pafImageTmp, nRequestXSize, nRequestYSize, bandFileType, 2, nullptr, dataTypeSize, - nBlockXSize * dataTypeSize, bandFileSize, nullptr); + static_cast(dataTypeSize) * nBlockXSize, + bandFileDataTypeSize, nullptr); /* calibrate the complex values */ for (int i = 0; i < nRequestYSize; i++) @@ -827,8 +788,8 @@ CPLErr RCMCalibRasterBand::IReadBlock(int nBlockXOff, int nBlockYOff, const int nTruePixOff = (i * nBlockXSize) + j; // Formula for Complex Q+J - const float real = static_cast(pnImageTmp[nPixOff]); - const float img = static_cast(pnImageTmp[nPixOff + 1]); + const float real = pafImageTmp[nPixOff]; + const float img = pafImageTmp[nPixOff + 1]; const float digitalValue = (real * real) + (img * img); const float lutValue = static_cast(m_nfTable[nBlockXOff * nBlockXSize + j]); @@ -838,7 +799,7 @@ CPLErr RCMCalibRasterBand::IReadBlock(int nBlockXOff, int nBlockYOff, } } - CPLFree(pnImageTmp); + CPLFree(pafImageTmp); } else if (this->m_eOriginalType == GDT_Float32) @@ -907,13 +868,14 @@ CPLErr RCMCalibRasterBand::IReadBlock(int nBlockXOff, int nBlockYOff, else if (this->m_eOriginalType == GDT_UInt16) { - GUInt16 *pnImageTmp; /* read in detected values */ - pnImageTmp = static_cast(CPLMalloc( - nBlockXSize * nBlockYSize * GDALGetDataTypeSizeBytes(GDT_UInt16))); + GUInt16 *panImageTmp = static_cast(VSI_MALLOC3_VERBOSE( + nBlockXSize, nBlockYSize, GDALGetDataTypeSizeBytes(GDT_UInt16))); + if (!panImageTmp) + return CE_Failure; eErr = m_poBandDataset->RasterIO( GF_Read, nBlockXOff * nBlockXSize, nBlockYOff * nBlockYSize, - nRequestXSize, nRequestYSize, pnImageTmp, nRequestXSize, + nRequestXSize, nRequestYSize, panImageTmp, nRequestXSize, nRequestYSize, GDT_UInt16, 1, nullptr, 2, nBlockXSize * 2, 0, nullptr); @@ -925,7 +887,7 @@ CPLErr RCMCalibRasterBand::IReadBlock(int 
nBlockXOff, int nBlockYOff, const int nPixOff = (i * nBlockXSize) + j; const float digitalValue = - static_cast(pnImageTmp[nPixOff]); + static_cast(panImageTmp[nPixOff]); const float A = static_cast(m_nfTable[nBlockXOff * nBlockXSize + j]); reinterpret_cast(pImage)[nPixOff] = @@ -934,16 +896,18 @@ CPLErr RCMCalibRasterBand::IReadBlock(int nBlockXOff, int nBlockYOff, A; } } - CPLFree(pnImageTmp); + CPLFree(panImageTmp); } /* Ticket #2104: Support for ScanSAR products */ else if (this->m_eOriginalType == GDT_Byte) { - GByte *pnImageTmp; - pnImageTmp = static_cast(CPLMalloc(nBlockXSize * nBlockYSize)); + GByte *pabyImageTmp = + static_cast(VSI_MALLOC2_VERBOSE(nBlockXSize, nBlockYSize)); + if (!pabyImageTmp) + return CE_Failure; eErr = m_poBandDataset->RasterIO( GF_Read, nBlockXOff * nBlockXSize, nBlockYOff * nBlockYSize, - nRequestXSize, nRequestYSize, pnImageTmp, nRequestXSize, + nRequestXSize, nRequestYSize, pabyImageTmp, nRequestXSize, nRequestYSize, GDT_Byte, 1, nullptr, 1, nBlockXSize, 0, nullptr); /* iterate over detected values */ @@ -954,7 +918,7 @@ CPLErr RCMCalibRasterBand::IReadBlock(int nBlockXOff, int nBlockYOff, const int nPixOff = (i * nBlockXSize) + j; const float digitalValue = - static_cast(pnImageTmp[nPixOff]); + static_cast(pabyImageTmp[nPixOff]); const float A = static_cast(m_nfTable[nBlockXOff * nBlockXSize + j]); reinterpret_cast(pImage)[nPixOff] = @@ -963,7 +927,7 @@ CPLErr RCMCalibRasterBand::IReadBlock(int nBlockXOff, int nBlockYOff, A; } } - CPLFree(pnImageTmp); + CPLFree(pabyImageTmp); } else { @@ -1220,14 +1184,8 @@ GDALDataset *RCMDataset::Open(GDALOpenInfo *poOpenInfo) psSceneAttributes, "imageAttributes.samplesPerLine", "-1")); poDS->nRasterYSize = atoi( CPLGetXMLValue(psSceneAttributes, "imageAttributes.numLines", "-1")); - if (poDS->nRasterXSize <= 1 || poDS->nRasterYSize <= 1) + if (!GDALCheckDatasetDimensions(poDS->nRasterXSize, poDS->nRasterYSize)) { - CPLError( - CE_Failure, CPLE_OpenFailed, - "ERROR: Non-sane raster dimensions provided in product.xml. 
If " - "this is " - "a valid RCM scene, please contact your data provider for " - "a corrected dataset."); return nullptr; } From 81b65021297f37192a7f662421c0e8783f52b488 Mon Sep 17 00:00:00 2001 From: Even Rouault Date: Sun, 10 Nov 2024 21:35:44 +0100 Subject: [PATCH 57/62] RCM: harden against excessive memory allocation --- frmts/rcm/rcmdataset.cpp | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/frmts/rcm/rcmdataset.cpp b/frmts/rcm/rcmdataset.cpp index 87044835940c..46248c14c7c1 100644 --- a/frmts/rcm/rcmdataset.cpp +++ b/frmts/rcm/rcmdataset.cpp @@ -79,7 +79,10 @@ static double *InterpolateValues(CSLConstList papszList, int tableSize, int pixelFirstLutValue) { /* Allocate the right LUT size according to the product range pixel */ - double *table = static_cast(CPLCalloc(sizeof(double), tableSize)); + double *table = + static_cast(VSI_CALLOC_VERBOSE(sizeof(double), tableSize)); + if (!table) + return nullptr; if (stepSize <= 0) { @@ -459,14 +462,26 @@ void RCMCalibRasterBand::ReadLUT() return; } + // Avoid excessive memory allocation + if (this->m_nTableSize > 1000 * 1000) + { + CPLError(CE_Failure, CPLE_NotSupported, "Too many elements in LUT: %d", + this->m_nTableSize); + return; + } + /* Allocate the right LUT size according to the product range pixel */ this->m_nfTable = InterpolateValues(aosLUTList.List(), this->m_nTableSize, this->stepSize, this->numberOfValues, this->pixelFirstLutValue); + if (!this->m_nfTable) + return; - const size_t nLen = - this->m_nTableSize * max_space_for_string; // 32 max + space - char *lut_gains = static_cast(CPLCalloc(1, nLen)); + // 32 max + space + char *lut_gains = static_cast( + VSI_CALLOC_VERBOSE(this->m_nTableSize, max_space_for_string)); + if (!lut_gains) + return; for (int i = 0; i < this->m_nTableSize; i++) { From fbe5743bb752f62cbc65b428295eff26cb3e8a6a Mon Sep 17 00:00:00 2001 From: Even Rouault Date: Sun, 10 Nov 2024 23:07:40 +0100 Subject: [PATCH 58/62] autotest: fix pyarrow 18 compatibility --- autotest/ogr/ogr_mem.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/autotest/ogr/ogr_mem.py b/autotest/ogr/ogr_mem.py index 3d26ac31a00b..e763eb031d7c 100755 --- a/autotest/ogr/ogr_mem.py +++ b/autotest/ogr/ogr_mem.py @@ -2872,9 +2872,12 @@ def test_ogr_mem_arrow_json(): lyr.CreateField(field_def) stream = lyr.GetArrowStreamAsPyArrow() - md = stream.schema["field_json"].metadata - assert b"ARROW:extension:name" in md - assert md[b"ARROW:extension:name"] == b"arrow.json" + field_schema = stream.schema["field_json"] + # Since pyarrow 18, the field type is extension + if str(field_schema.type) != "extension": + md = field_schema.metadata + assert b"ARROW:extension:name" in md + assert md[b"ARROW:extension:name"] == b"arrow.json" ############################################################################### From 67c45efb8836ed3bcdb826bbca6901bab64f2112 Mon Sep 17 00:00:00 2001 From: Even Rouault Date: Mon, 11 Nov 2024 02:30:22 +0100 Subject: [PATCH 59/62] GDALRasterBand::ComputeRasterMinMaxLocation(): fix memleak in error code path (master only) --- gcore/gdalrasterband.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/gcore/gdalrasterband.cpp b/gcore/gdalrasterband.cpp index e84a20fdbef9..87c814efdedf 100644 --- a/gcore/gdalrasterband.cpp +++ b/gcore/gdalrasterband.cpp @@ -7473,6 +7473,9 @@ CPLErr GDALRasterBand::ComputeRasterMinMaxLocation(double *pdfMin, return CE_Failure; } + if (!InitBlockInfo()) + return CE_Failure; + int bGotNoDataValue = FALSE; 
const double dfNoDataValue = GetNoDataValue(&bGotNoDataValue); bGotNoDataValue = bGotNoDataValue && !std::isnan(dfNoDataValue); @@ -7514,9 +7517,6 @@ CPLErr GDALRasterBand::ComputeRasterMinMaxLocation(double *pdfMin, } } - if (!InitBlockInfo()) - return CE_Failure; - const GIntBig nTotalBlocks = static_cast(nBlocksPerRow) * nBlocksPerColumn; bool bNeedsMin = pdfMin || pnMinX || pnMinY; From e85d36397f4b5c20318773b467bdaf62b5f4d71e Mon Sep 17 00:00:00 2001 From: Even Rouault Date: Mon, 11 Nov 2024 02:30:49 +0100 Subject: [PATCH 60/62] Silence various non interesting Coverity Scan reports --- autotest/cpp/test_ogr.cpp | 3 +++ frmts/gtiff/tifvsi.cpp | 3 +++ ogr/ogrcurve.cpp | 2 +- ogr/ogrgeometryfactory.cpp | 2 +- ogr/ogrsf_frmts/dxf/ogrdxf_leader.cpp | 2 +- port/cpl_worker_thread_pool.cpp | 12 ++++++------ 6 files changed, 15 insertions(+), 9 deletions(-) diff --git a/autotest/cpp/test_ogr.cpp b/autotest/cpp/test_ogr.cpp index 4816b663dfa8..8a91c026f57b 100644 --- a/autotest/cpp/test_ogr.cpp +++ b/autotest/cpp/test_ogr.cpp @@ -69,6 +69,7 @@ void testSpatialReferenceLeakOnCopy(OGRSpatialReference *poSRS) ASSERT_GT(nCurCount, nLastCount); nLastCount = nCurCount; + // coverity[copy_assignment_call] value3 = value; ASSERT_EQ(nLastCount, poSRS->GetReferenceCount()); } @@ -327,6 +328,7 @@ TEST_F(test_ogr, OGRGeometryCollection_copy_constructor_illegal_use) CPLErrorReset(); { CPLErrorHandlerPusher oPusher(CPLQuietErrorHandler); + // coverity[copy_assignment_call] *mp_as_gc = gc; } EXPECT_STREQ(CPLGetLastErrorMsg(), @@ -360,6 +362,7 @@ TEST_F(test_ogr, OGRCurvePolygon_copy_constructor_illegal_use) CPLErrorReset(); { CPLErrorHandlerPusher oPusher(CPLQuietErrorHandler); + // coverity[copy_assignment_call] *poly_as_cp = cp; } EXPECT_STREQ(CPLGetLastErrorMsg(), diff --git a/frmts/gtiff/tifvsi.cpp b/frmts/gtiff/tifvsi.cpp index afc77e171e2e..a889f3cad534 100644 --- a/frmts/gtiff/tifvsi.cpp +++ b/frmts/gtiff/tifvsi.cpp @@ -439,7 +439,10 @@ static void VSI_TIFFSetOpenOptions(TIFFOpenOptions *opts) { const auto nUsableRAM = CPLGetUsablePhysicalRAM(); if (nUsableRAM > 0) + { + // coverity[return_overflow] return nUsableRAM / 10 * 9; + } else return 0; } diff --git a/ogr/ogrcurve.cpp b/ogr/ogrcurve.cpp index 4e8b7d77f255..260e875571d1 100644 --- a/ogr/ogrcurve.cpp +++ b/ogr/ogrcurve.cpp @@ -756,7 +756,7 @@ int OGRCurve::isClockwise() const for (int i = 1; i < nPointCount - 1; i++) { ++oIter; - OGRPoint oPointCur = *oIter; + const OGRPoint oPointCur = *oIter; if (bNextPointIsNextSel) { oPointNextSel = oPointCur; diff --git a/ogr/ogrgeometryfactory.cpp b/ogr/ogrgeometryfactory.cpp index e68303667da9..dd4d5833d05d 100644 --- a/ogr/ogrgeometryfactory.cpp +++ b/ogr/ogrgeometryfactory.cpp @@ -2010,7 +2010,7 @@ OGRGeometry *OGRGeometryFactory::organizePolygons(OGRGeometry **papoPolygons, // If it is outside, then i cannot be inside j. 
break; } - previousPoint = point; + previousPoint = std::move(point); } if (!b_i_inside_j && k == nPoints && nPoints > 2) { diff --git a/ogr/ogrsf_frmts/dxf/ogrdxf_leader.cpp b/ogr/ogrsf_frmts/dxf/ogrdxf_leader.cpp index 4f5ae6ba6f4d..24ca0f768cac 100644 --- a/ogr/ogrsf_frmts/dxf/ogrdxf_leader.cpp +++ b/ogr/ogrsf_frmts/dxf/ogrdxf_leader.cpp @@ -1369,7 +1369,7 @@ static void InterpolateSpline(OGRLineString *const poLine, aoDataPoints.push_back( DXFTriple(oPoint.getX(), oPoint.getY(), oPoint.getZ())); - oPrevPoint = oPoint; + oPrevPoint = std::move(oPoint); } nDataPoints = static_cast(aoDataPoints.size()); if (nDataPoints < 2) diff --git a/port/cpl_worker_thread_pool.cpp b/port/cpl_worker_thread_pool.cpp index c2134f0b7a30..2205dd952c67 100644 --- a/port/cpl_worker_thread_pool.cpp +++ b/port/cpl_worker_thread_pool.cpp @@ -583,12 +583,12 @@ bool CPLJobQueue::SubmitJob(std::function task) // cppcheck-suppress knownConditionTrueFalse // coverity[uninit_member,copy_constructor_call] - return m_poPool->SubmitJob( - [this, task] - { - task(); - DeclareJobFinished(); - }); + const auto lambda = [this, task] + { + task(); + DeclareJobFinished(); + }; + return m_poPool->SubmitJob(lambda); } /************************************************************************/ From 971762d900faee5996faa667080cf69e67fd6ea4 Mon Sep 17 00:00:00 2001 From: Even Rouault Date: Mon, 11 Nov 2024 03:26:52 +0100 Subject: [PATCH 61/62] Fix Coverity suppression --- port/cpl_worker_thread_pool.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/port/cpl_worker_thread_pool.cpp b/port/cpl_worker_thread_pool.cpp index 2205dd952c67..387ad7bbba43 100644 --- a/port/cpl_worker_thread_pool.cpp +++ b/port/cpl_worker_thread_pool.cpp @@ -581,13 +581,13 @@ bool CPLJobQueue::SubmitJob(std::function task) m_nPendingJobs++; } - // cppcheck-suppress knownConditionTrueFalse // coverity[uninit_member,copy_constructor_call] const auto lambda = [this, task] { task(); DeclareJobFinished(); }; + // cppcheck-suppress knownConditionTrueFalse return m_poPool->SubmitJob(lambda); } From 6b9549ea7a302857b08751e6b082c033e5f73e8b Mon Sep 17 00:00:00 2001 From: quassy <369996+quassy@users.noreply.github.com> Date: Mon, 11 Nov 2024 16:26:28 +0300 Subject: [PATCH 62/62] Doc: Clarify Ubuntu version is 24.04 for GDAL 3.10 (#11244) [ci skip] --- docker/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/README.md b/docker/README.md index 6ade587c2ca0..74d3a1d865a3 100644 --- a/docker/README.md +++ b/docker/README.md @@ -47,7 +47,7 @@ See [alpine-normal/Dockerfile](alpine-normal/Dockerfile) # Ubuntu based Ubuntu version: -* 24.04 for GDAL 3.9 +* 24.04 for GDAL 3.9 and 3.10 * 22.04 for GDAL 3.6, 3.7 and 3.8 * 20.04 for GDAL 3.4 and 3.5