From 1bc62e7114da9bcb68027bf050e418e292aa138f Mon Sep 17 00:00:00 2001
From: Troels Henriksen <athas@sigkill.dk>
Date: Tue, 4 Feb 2025 08:31:37 +0100
Subject: [PATCH] Support newer ISPCs.

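I also had to comment out the pointer-typed `make_extract`
instantiations in rts/c/ispc_util.h, as they caused ISPC to crash. A
couple of tests (tests/ad/issue1473.fut and
tests/tiling/tiling_1d_complex.fut) also fail now and are tagged
`no_ispc`. This backend is a bit rickety as it does not see much
maintenance.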
---
 CHANGELOG.md                                  |   2 +
 rts/c/ispc_util.h                             |  22 ++--
 rts/c/scalar.h                                |  26 ++---
 rts/c/scalar_f16.h                            |  12 +-
 rts/c/uniform.h                               |   9 +-
 src/Futhark/CodeGen/Backends/MulticoreC.hs    |   2 +-
 src/Futhark/CodeGen/Backends/MulticoreISPC.hs |   2 +-
 tests/ad/issue1473.fut                        | 103 +++++++++---------
 tests/tiling/tiling_1d_complex.fut            |  24 ++--
 9 files changed, 101 insertions(+), 101 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 334bbbfba8..f5607ceab0 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -35,6 +35,8 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 
 * A bug in the "sink" optimisation pass could cause compiler crashes.
 
+* Compile errors with newer versions of `ispc`.
+
 ## [0.25.26]
 
 ### Fixed
diff --git a/rts/c/ispc_util.h b/rts/c/ispc_util.h
index 7a5b53ff7a..45224b7839 100644
--- a/rts/c/ispc_util.h
+++ b/rts/c/ispc_util.h
@@ -27,17 +27,17 @@ make_extract(uint64)
 make_extract(float16)
 make_extract(float)
 make_extract(double)
-make_extract(int8* uniform)
-make_extract(int16* uniform)
-make_extract(int32* uniform)
-make_extract(int64* uniform)
-make_extract(uint8* uniform)
-make_extract(uint16* uniform)
-make_extract(uint32* uniform)
-make_extract(uint64* uniform)
-make_extract(float16* uniform)
-make_extract(float* uniform)
-make_extract(double* uniform)
+/* make_extract(int8* uniform) */
+/* make_extract(int16* uniform) */
+/* make_extract(int32* uniform) */
+/* make_extract(int64* uniform) */
+/* make_extract(uint8* uniform) */
+/* make_extract(uint16* uniform) */
+/* make_extract(uint32* uniform) */
+/* make_extract(uint64* uniform) */
+/* make_extract(float16* uniform) */
+/* make_extract(float* uniform) */
+/* make_extract(double* uniform) */
 make_extract(struct futhark_context)
 make_extract(struct memblock)
 
diff --git a/rts/c/scalar.h b/rts/c/scalar.h
index 7435374a97..0e7caf08fc 100644
--- a/rts/c/scalar.h
+++ b/rts/c/scalar.h
@@ -68,7 +68,7 @@ SCALAR_FUN_ATTR uint64_t mul64(uint64_t x, uint64_t y) {
   return x * y;
 }
 
-#if ISPC
+#if defined(ISPC)
 
 SCALAR_FUN_ATTR uint8_t udiv8(uint8_t x, uint8_t y) {
   // This strange pattern is used to prevent the ISPC compiler from
@@ -1309,7 +1309,7 @@ SCALAR_FUN_ATTR  uint8_t futrts_smul_hi8 ( int8_t a, int8_t b) { return ((int16_
 SCALAR_FUN_ATTR uint16_t futrts_smul_hi16(int16_t a, int16_t b) { return ((int32_t)a) * ((int32_t)b) >> 16; }
 SCALAR_FUN_ATTR uint32_t futrts_smul_hi32(int32_t a, int32_t b) { return __mulhi(a, b); }
 SCALAR_FUN_ATTR uint64_t futrts_smul_hi64(int64_t a, int64_t b) { return __mul64hi(a, b); }
-#elif ISPC
+#elif defined(ISPC)
 SCALAR_FUN_ATTR uint8_t futrts_umul_hi8(uint8_t a, uint8_t b) { return ((uint16_t)a) * ((uint16_t)b) >> 8; }
 SCALAR_FUN_ATTR uint16_t futrts_umul_hi16(uint16_t a, uint16_t b) { return ((uint32_t)a) * ((uint32_t)b) >> 16; }
 SCALAR_FUN_ATTR uint32_t futrts_umul_hi32(uint32_t a, uint32_t b) { return ((uint64_t)a) * ((uint64_t)b) >> 32; }
@@ -1430,7 +1430,7 @@ SCALAR_FUN_ATTR int32_t futrts_clzz64(int64_t x) {
   return __clzll(x);
 }
 
-#elif ISPC
+#elif defined(ISPC)
 
 SCALAR_FUN_ATTR int32_t futrts_clzz8(int8_t x) {
   return count_leading_zeros((int32_t)(uint8_t)x)-24;
@@ -1518,7 +1518,7 @@ SCALAR_FUN_ATTR int32_t futrts_ctzz64(int64_t x) {
   return y == 0 ? 64 : y - 1;
 }
 
-#elif ISPC
+#elif defined(ISPC)
 
 SCALAR_FUN_ATTR int32_t futrts_ctzz8(int8_t x) {
   return x == 0 ? 8 : count_trailing_zeros((int32_t)x);
@@ -1628,7 +1628,7 @@ SCALAR_FUN_ATTR float fpow32(float x, float y) {
   return pow(x, y);
 }
 
-#elif ISPC
+#elif defined(ISPC)
 
 SCALAR_FUN_ATTR float fabs32(float x) {
   return abs(x);
@@ -1645,7 +1645,7 @@ SCALAR_FUN_ATTR float fmin32(float x, float y) {
 SCALAR_FUN_ATTR float fpow32(float a, float b) {
   float ret;
   foreach_active (i) {
-      uniform float r = __stdlib_powf(extract(a, i), extract(b, i));
+      uniform float r = pow(extract(a, i), extract(b, i));
       ret = insert(ret, i, r);
   }
   return ret;
@@ -1674,7 +1674,7 @@ SCALAR_FUN_ATTR bool futrts_isnan32(float x) {
   return isnan(x);
 }
 
-#if ISPC
+#if defined(ISPC)
 
 SCALAR_FUN_ATTR bool futrts_isinf32(float x) {
   return !isnan(x) && isnan(x - x);
@@ -1905,7 +1905,7 @@ SCALAR_FUN_ATTR float futrts_fma32(float a, float b, float c) {
   return fma(a, b, c);
 }
 
-#elif ISPC
+#elif defined(ISPC)
 
 SCALAR_FUN_ATTR float futrts_log32(float x) {
   return futrts_isfinite32(x) || (futrts_isinf32(x) && x < 0)? log(x) : x;
@@ -2107,7 +2107,7 @@ SCALAR_FUN_ATTR float futrts_lerp32(float v0, float v1, float t) {
 }
 
 SCALAR_FUN_ATTR float futrts_ldexp32(float x, int32_t y) {
-  return x * pow((double)2.0, (double)y);
+  return x * pow((uniform float)2.0, (float)y);
 }
 
 SCALAR_FUN_ATTR float futrts_copysign32(float x, float y) {
@@ -2267,7 +2267,7 @@ SCALAR_FUN_ATTR float futrts_fma32(float a, float b, float c) {
 }
 #endif
 
-#if ISPC
+#if defined(ISPC)
 SCALAR_FUN_ATTR int32_t futrts_to_bits32(float x) {
   return intbits(x);
 }
@@ -2306,7 +2306,7 @@ SCALAR_FUN_ATTR float fsignum32(float x) {
 SCALAR_FUN_ATTR double futrts_from_bits64(int64_t x);
 SCALAR_FUN_ATTR int64_t futrts_to_bits64(double x);
 
-#if ISPC
+#if defined(ISPC)
 SCALAR_FUN_ATTR bool futrts_isinf64(float x) {
   return !isnan(x) && isnan(x - x);
 }
@@ -2386,7 +2386,7 @@ SCALAR_FUN_ATTR double fmin64(double x, double y) {
 SCALAR_FUN_ATTR double fpow64(double a, double b) {
   float ret;
   foreach_active (i) {
-      uniform float r = __stdlib_powf(extract(a, i), extract(b, i));
+      uniform float r = pow(extract(a, i), extract(b, i));
       ret = insert(ret, i, r);
   }
   return ret;
@@ -2673,7 +2673,7 @@ SCALAR_FUN_ATTR double futrts_lerp64(double v0, double v1, double t) {
 }
 
 SCALAR_FUN_ATTR double futrts_ldexp64(double x, int32_t y) {
-  return x * pow((double)2.0, (double)y);
+  return x * pow((uniform double)2.0, (double)y);
 }
 
 SCALAR_FUN_ATTR double futrts_copysign64(double x, double y) {
diff --git a/rts/c/scalar_f16.h b/rts/c/scalar_f16.h
index f6387c2c0b..eb20c7806e 100644
--- a/rts/c/scalar_f16.h
+++ b/rts/c/scalar_f16.h
@@ -23,7 +23,7 @@
 // compiler will have to be real careful!
 typedef float f16;
 
-#elif ISPC
+#elif defined(ISPC)
 typedef float16 f16;
 
 #else
@@ -154,7 +154,7 @@ SCALAR_FUN_ATTR f16 fpow16(f16 x, f16 y) {
   return pow(x, y);
 }
 
-#elif ISPC
+#elif defined(ISPC)
 SCALAR_FUN_ATTR f16 fabs16(f16 x) {
   return abs(x);
 }
@@ -190,7 +190,7 @@ SCALAR_FUN_ATTR f16 fpow16(f16 x, f16 y) {
 }
 #endif
 
-#if ISPC
+#if defined(ISPC)
 SCALAR_FUN_ATTR bool futrts_isinf16(float x) {
   return !futrts_isnan16(x) && futrts_isnan16(x - x);
 }
@@ -345,7 +345,7 @@ SCALAR_FUN_ATTR f16 futrts_mad16(f16 a, f16 b, f16 c) {
 SCALAR_FUN_ATTR f16 futrts_fma16(f16 a, f16 b, f16 c) {
   return fma(a, b, c);
 }
-#elif ISPC
+#elif defined(ISPC)
 
 SCALAR_FUN_ATTR f16 futrts_log16(f16 x) {
   return futrts_isfinite16(x) || (futrts_isinf16(x) && x < 0) ? log(x) : x;
@@ -664,7 +664,7 @@ SCALAR_FUN_ATTR int16_t futrts_to_bits16(f16 x) {
 SCALAR_FUN_ATTR f16 futrts_from_bits16(int16_t x) {
   return __ushort_as_half(x);
 }
-#elif ISPC
+#elif defined(ISPC)
 
 SCALAR_FUN_ATTR int16_t futrts_to_bits16(f16 x) {
   varying int16_t y = *((varying int16_t * uniform)&x);
@@ -916,7 +916,7 @@ SCALAR_FUN_ATTR double fpconv_f16_f64(f16 x) {
   return (double) x;
 }
 
-#if ISPC
+#if defined(ISPC)
 SCALAR_FUN_ATTR f16 fpconv_f64_f16(double x) {
   return (f16) ((float)x);
 }
diff --git a/rts/c/uniform.h b/rts/c/uniform.h
index cd457e24dc..a9b7bfcc95 100644
--- a/rts/c/uniform.h
+++ b/rts/c/uniform.h
@@ -1,10 +1,9 @@
-
 // Start of uniform.h
 
 // Uniform versions of all library functions as to
 // improve performance in ISPC when in an uniform context.
 
-#if ISPC
+#if defined(ISPC)
 
 static inline uniform uint8_t add8(uniform uint8_t x, uniform uint8_t y) {
   return x + y;
@@ -839,7 +838,7 @@ static inline uniform float fmin32(uniform float x, uniform float y) {
 }
 
 static inline uniform float fpow32(uniform float x, uniform float y) {
-  return __stdlib_powf(x, y);
+  return pow(x, y);
 }
 
 static inline uniform bool futrts_isnan32(uniform float x) {
@@ -1181,7 +1180,7 @@ static inline uniform double fmin64(uniform double x, uniform double y) {
 }
 
 static inline uniform double fpow64(uniform double x, uniform double y) {
-  return __stdlib_powf(x, y);
+  return pow(x, y);
 }
 
 static inline uniform double futrts_log64(uniform double x) {
@@ -1445,7 +1444,7 @@ static inline uniform double fpconv_f16_f64(uniform f16 x) {
 }
 
 static inline uniform f16 fpconv_f64_f16(uniform double x) {
-  return (uniform f16) ((uniform float)x); 
+  return (uniform f16) ((uniform float)x);
 }
 
 #endif
diff --git a/src/Futhark/CodeGen/Backends/MulticoreC.hs b/src/Futhark/CodeGen/Backends/MulticoreC.hs
index a624fe8081..ead3848a85 100644
--- a/src/Futhark/CodeGen/Backends/MulticoreC.hs
+++ b/src/Futhark/CodeGen/Backends/MulticoreC.hs
@@ -138,7 +138,7 @@ compileSetRetvalStructValues struct vnames we = concat $ zipWith field vnames we
   where
     field name (ct, Prim _) =
       [C.cstms|$id:struct.$id:(closureRetvalStructField name)=(($ty:ct*)&$id:name);
-               $escstm:("#if ISPC")
+               $escstm:("#if defined(ISPC)")
                $id:struct.$id:(closureRetvalStructField name)+= programIndex;
                $escstm:("#endif")|]
     field name (_, MemBlock) =
diff --git a/src/Futhark/CodeGen/Backends/MulticoreISPC.hs b/src/Futhark/CodeGen/Backends/MulticoreISPC.hs
index 09b92e5fae..805e16faef 100644
--- a/src/Futhark/CodeGen/Backends/MulticoreISPC.hs
+++ b/src/Futhark/CodeGen/Backends/MulticoreISPC.hs
@@ -803,7 +803,7 @@ compileOp (SegOp name params seq_task par_task retvals (SchedulerInfo e sched))
   aos_name <- newVName "aos"
   GC.items
     [C.citems|
-    $escstm:("#if ISPC")
+    $escstm:("#if defined(ISPC)")
     $tyqual:uniform struct $id:fstruct $id:aos_name[programCount];
     $id:aos_name[programIndex] = $id:(fstruct <> "_");
     $escstm:("foreach_active (i)")
diff --git a/tests/ad/issue1473.fut b/tests/ad/issue1473.fut
index 42f15ae9a6..a8207b757d 100644
--- a/tests/ad/issue1473.fut
+++ b/tests/ad/issue1473.fut
@@ -1,72 +1,69 @@
 -- test mpr sim with ad for params
--- ==
 
 def pi = 3.141592653589793f32
 
 -- some type abbreviations
 type mpr_pars = {G: f32, I: f32, Delta: f32, eta: f32, tau: f32, J: f32}
 type mpr_node = (f32, f32)
-type mpr_net [n] = [n] mpr_node
+type mpr_net [n] = [n]mpr_node
+
 -- this is tranposed from mpr-pdq to avoid tranposes in history update
-type mpr_hist [t] [n] = [t] mpr_net [n]
+type mpr_hist [t] [n] = [t]mpr_net [n]
 type connectome [n] = {weights: [n][n]f32, idelays: [n][n]i64}
 
 -- do one time step w/ Euler
-def mpr_step [t] [n] (now: i64) (dt: f32) (buf: *mpr_hist[t][n]) (conn: connectome[n]) (p: mpr_pars): *mpr_hist[t][n] =
-
-    -- define individual derivatives as in mpr pdq
-    let dr r V = 1/p.tau * ( p.Delta / (pi * p.tau) + 2 * V * r)
-    let dV r V r_c = 1/p.tau * ( V**2 - pi**2 * p.tau**2 * r**2 + p.eta + p.J * p.tau * r + p.I + r_c)
-    let dfun (r, V, c) = (dr r V, dV r V c)
-
-    -- unpack current state for clarity
-    let (r, V) = last buf |> unzip
-
-    -- connectivity eval
-    let r_c_i i w d = map2 (\wj dj -> wj * buf[now - dj, i].0) w d |> reduce (+) 0f32 |> (*p.G)
-    let r_c = map3 r_c_i (iota n) conn.weights conn.idelays
-
-    -- Euler step
-    let erV = map3 (\r V c -> (dr r V, dV r V c)) r V r_c
-           |> map2 (\(r, V) (dr, dV) -> (r + dt * dr, V + dt * dV)) (last buf)
-           |> map1 (\(r, V) -> (if r >= 0f32 then r else 0f32, V))
-
-    -- now for the Heun step
-    let (er, eV) = unzip erV
-    let hrV = map3 (\r V c -> (dr r V, dV r V c)) er eV r_c
-           |> map2 (\(r, V) (dr, dV) -> (r + dt * dr, V + dt * dV)) (last buf)
-           |> map1 (\(r, V) -> (if r >= 0f32 then r else 0f32, V))
-
-    -- return updated buffer
-    in buf with [now + 1] = copy hrV
+def mpr_step [t] [n] (now: i64) (dt: f32) (buf: *mpr_hist [t] [n]) (conn: connectome [n]) (p: mpr_pars) : *mpr_hist [t] [n] =
+  -- define individual derivatives as in mpr pdq
+  let dr r V = 1 / p.tau * (p.Delta / (pi * p.tau) + 2 * V * r)
+  let dV r V r_c = 1 / p.tau * (V ** 2 - pi ** 2 * p.tau ** 2 * r ** 2 + p.eta + p.J * p.tau * r + p.I + r_c)
+  let dfun (r, V, c) = (dr r V, dV r V c)
+  -- unpack current state for clarity
+  let (r, V) = last buf |> unzip
+  -- connectivity eval
+  let r_c_i i w d = map2 (\wj dj -> wj * buf[now - dj, i].0) w d |> reduce (+) 0f32 |> (* p.G)
+  let r_c = map3 r_c_i (iota n) conn.weights conn.idelays
+  -- Euler step
+  let erV =
+    map3 (\r V c -> (dr r V, dV r V c)) r V r_c
+    |> map2 (\(r, V) (dr, dV) -> (r + dt * dr, V + dt * dV)) (last buf)
+    |> map1 (\(r, V) -> (if r >= 0f32 then r else 0f32, V))
+  -- now for the Heun step
+  let (er, eV) = unzip erV
+  let hrV =
+    map3 (\r V c -> (dr r V, dV r V c)) er eV r_c
+    |> map2 (\(r, V) (dr, dV) -> (r + dt * dr, V + dt * dV)) (last buf)
+    |> map1 (\(r, V) -> (if r >= 0f32 then r else 0f32, V))
+  -- return updated buffer
+  in buf with [now + 1] = copy hrV
 
-def run_mpr [t] [n] (horizon: i64) (dt: f32) (buf: mpr_hist[t][n]) (conn: connectome[n]) (p: mpr_pars): mpr_hist[t][n] =
-    loop buf = copy buf
-        for now < (t - horizon - 1) do mpr_step (now + horizon) dt buf conn p
+def run_mpr [t] [n] (horizon: i64) (dt: f32) (buf: mpr_hist [t] [n]) (conn: connectome [n]) (p: mpr_pars) : mpr_hist [t] [n] =
+  loop buf = copy buf
+  for now < (t - horizon - 1) do
+    mpr_step (now + horizon) dt buf conn p
 
-def mpr_pars_with_G (p: mpr_pars) (new_G: f32): mpr_pars =
-    let new_p = copy p
-    in new_p with G = new_G
+def mpr_pars_with_G (p: mpr_pars) (new_G: f32) : mpr_pars =
+  let new_p = copy p
+  in new_p with G = new_G
 
-def loss [t] [n] (x:mpr_hist[t][n]): f32 =
-    let r = map unzip x[t-10:] |> unzip |> (.0)
-    let sum = map (reduce (+) 0f32) r |> reduce (+) 0f32
-    in
-    sum
+def loss [t] [n] (x: mpr_hist [t] [n]) : f32 =
+  let r = map unzip x[t - 10:] |> unzip |> (.0)
+  let sum = map (reduce (+) 0f32) r |> reduce (+) 0f32
+  in sum
 
-def sweep [t] [n] (ng: i64) (horizon: i64) (dt: f32) (buf: mpr_hist[t][n]) (conn: connectome[n]) (p: mpr_pars): [ng]f32 =
-    let Gs = tabulate ng (\i -> 0.0 + (f32.i64 i) * 0.1)
-    let do_one G = run_mpr horizon dt buf conn (mpr_pars_with_G p G) |> loss
-    in map (\g -> vjp do_one g 1f32) Gs
+def sweep [t] [n] (ng: i64) (horizon: i64) (dt: f32) (buf: mpr_hist [t] [n]) (conn: connectome [n]) (p: mpr_pars) : [ng]f32 =
+  let Gs = tabulate ng (\i -> 0.0 + (f32.i64 i) * 0.1)
+  let do_one G = run_mpr horizon dt buf conn (mpr_pars_with_G p G) |> loss
+  in map (\g -> vjp do_one g 1f32) Gs
 
 -- ==
--- compiled input { 1i64 5i64 10i64 7i64 }
+-- no_ispc compiled input { 1i64 5i64 10i64 7i64 }
 -- output { [0.000086f32] }
 def main (ng: i64) (nh: i64) (nt: i64) (nn: i64) =
-    let dt = 0.01f32
-    let buf = tabulate_2d (nt + nh) nn (\i j -> (0.1f32, -2.0f32))
-    let conn = {weights=tabulate_2d nn nn (\i j -> 0.1f32),
-                idelays=tabulate_2d nn nn (\i j -> ((i * j) % nh))
-                }
-    let p = {G=0.1f32, I=0.0f32, Delta=0.7f32, eta=(-4.6f32), tau=1.0f32, J=14.5f32}
-    in sweep ng nh dt buf conn p
+  let dt = 0.01f32
+  let buf = tabulate_2d (nt + nh) nn (\i j -> (0.1f32, -2.0f32))
+  let conn =
+    { weights = tabulate_2d nn nn (\i j -> 0.1f32)
+    , idelays = tabulate_2d nn nn (\i j -> ((i * j) % nh))
+    }
+  let p = {G = 0.1f32, I = 0.0f32, Delta = 0.7f32, eta = (-4.6f32), tau = 1.0f32, J = 14.5f32}
+  in sweep ng nh dt buf conn p
diff --git a/tests/tiling/tiling_1d_complex.fut b/tests/tiling/tiling_1d_complex.fut
index 79b78730d4..fa9641b533 100644
--- a/tests/tiling/tiling_1d_complex.fut
+++ b/tests/tiling/tiling_1d_complex.fut
@@ -1,24 +1,26 @@
 -- More stuff that can go wrong with a larger tiling prelude, but
 -- still just 1D tiling.
 -- ==
--- compiled random input { [2000]f32 [2000]f32 } auto output
+-- no_ispc compiled random input { [2000]f32 [2000]f32 } auto output
 -- structure gpu { SegMap/Loop/SegMap 2 }
 
-type point = (f32,f32)
+type point = (f32, f32)
 
-def add_points ((x1,y1): point) ((x2,y2): point): point =
-  (x1+x2, y1+y2)
+def add_points ((x1, y1): point) ((x2, y2): point) : point =
+  (x1 + x2, y1 + y2)
 
-def euclid_dist_2 ((x1,y1): point) ((x2,y2): point): f32 =
-  (x2-x1)**2.0f32 + (y2-y1)**2.0f32
+def euclid_dist_2 ((x1, y1): point) ((x2, y2): point) : f32 =
+  (x2 - x1) ** 2.0f32 + (y2 - y1) ** 2.0f32
 
-def closest_point (p1: (i32, f32)) (p2: (i32, f32)): (i32, f32) =
+def closest_point (p1: (i32, f32)) (p2: (i32, f32)) : (i32, f32) =
   if p1.1 < p2.1 then p1 else p2
 
-def find_nearest_point [k] (pts: [k]point) (pt: point): i32 =
-  let (i, _) = reduce_comm closest_point (0, euclid_dist_2 pt pts[0])
-                           (zip (map i32.i64 (iota k))
-                                (map (euclid_dist_2 pt) pts))
+def find_nearest_point [k] (pts: [k]point) (pt: point) : i32 =
+  let (i, _) =
+    reduce_comm closest_point
+                (0, euclid_dist_2 pt pts[0])
+                (zip (map i32.i64 (iota k))
+                     (map (euclid_dist_2 pt) pts))
   in i
 
 def main [n] (xs: [n]f32) (ys: [n]f32) =