From 1bc62e7114da9bcb68027bf050e418e292aa138f Mon Sep 17 00:00:00 2001 From: Troels Henriksen Date: Tue, 4 Feb 2025 08:31:37 +0100 Subject: [PATCH] Support newer ISPCs. I also had to remove some stuff that caused ISPC to crash, and a couple of tests also fail now. This backend is a bit rickety as it does not see much maintenance. --- CHANGELOG.md | 2 + rts/c/ispc_util.h | 22 ++-- rts/c/scalar.h | 26 ++--- rts/c/scalar_f16.h | 12 +- rts/c/uniform.h | 9 +- src/Futhark/CodeGen/Backends/MulticoreC.hs | 2 +- src/Futhark/CodeGen/Backends/MulticoreISPC.hs | 2 +- tests/ad/issue1473.fut | 103 +++++++++--------- tests/tiling/tiling_1d_complex.fut | 24 ++-- 9 files changed, 101 insertions(+), 101 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 334bbbfba8..f5607ceab0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -35,6 +35,8 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. * A bug in the "sink" optimisation pass could cause compiler crashes. +* Compile errors with newer versions of `ispc`. + ## [0.25.26] ### Fixed diff --git a/rts/c/ispc_util.h b/rts/c/ispc_util.h index 7a5b53ff7a..45224b7839 100644 --- a/rts/c/ispc_util.h +++ b/rts/c/ispc_util.h @@ -27,17 +27,17 @@ make_extract(uint64) make_extract(float16) make_extract(float) make_extract(double) -make_extract(int8* uniform) -make_extract(int16* uniform) -make_extract(int32* uniform) -make_extract(int64* uniform) -make_extract(uint8* uniform) -make_extract(uint16* uniform) -make_extract(uint32* uniform) -make_extract(uint64* uniform) -make_extract(float16* uniform) -make_extract(float* uniform) -make_extract(double* uniform) +/* make_extract(int8* uniform) */ +/* make_extract(int16* uniform) */ +/* make_extract(int32* uniform) */ +/* make_extract(int64* uniform) */ +/* make_extract(uint8* uniform) */ +/* make_extract(uint16* uniform) */ +/* make_extract(uint32* uniform) */ +/* make_extract(uint64* uniform) */ +/* make_extract(float16* uniform) */ +/* make_extract(float* uniform) */ +/* make_extract(double* uniform) */ make_extract(struct futhark_context) make_extract(struct memblock) diff --git a/rts/c/scalar.h b/rts/c/scalar.h index 7435374a97..0e7caf08fc 100644 --- a/rts/c/scalar.h +++ b/rts/c/scalar.h @@ -68,7 +68,7 @@ SCALAR_FUN_ATTR uint64_t mul64(uint64_t x, uint64_t y) { return x * y; } -#if ISPC +#if defined(ISPC) SCALAR_FUN_ATTR uint8_t udiv8(uint8_t x, uint8_t y) { // This strange pattern is used to prevent the ISPC compiler from @@ -1309,7 +1309,7 @@ SCALAR_FUN_ATTR uint8_t futrts_smul_hi8 ( int8_t a, int8_t b) { return ((int16_ SCALAR_FUN_ATTR uint16_t futrts_smul_hi16(int16_t a, int16_t b) { return ((int32_t)a) * ((int32_t)b) >> 16; } SCALAR_FUN_ATTR uint32_t futrts_smul_hi32(int32_t a, int32_t b) { return __mulhi(a, b); } SCALAR_FUN_ATTR uint64_t futrts_smul_hi64(int64_t a, int64_t b) { return __mul64hi(a, b); } -#elif ISPC +#elif defined(ISPC) SCALAR_FUN_ATTR uint8_t futrts_umul_hi8(uint8_t a, uint8_t b) { return ((uint16_t)a) * ((uint16_t)b) >> 8; } SCALAR_FUN_ATTR uint16_t futrts_umul_hi16(uint16_t a, uint16_t b) { return ((uint32_t)a) * ((uint32_t)b) >> 16; } SCALAR_FUN_ATTR uint32_t futrts_umul_hi32(uint32_t a, uint32_t b) { return ((uint64_t)a) * ((uint64_t)b) >> 32; } @@ -1430,7 +1430,7 @@ SCALAR_FUN_ATTR int32_t futrts_clzz64(int64_t x) { return __clzll(x); } -#elif ISPC +#elif defined(ISPC) SCALAR_FUN_ATTR int32_t futrts_clzz8(int8_t x) { return count_leading_zeros((int32_t)(uint8_t)x)-24; @@ -1518,7 +1518,7 @@ SCALAR_FUN_ATTR int32_t futrts_ctzz64(int64_t x) { return y == 0 ? 64 : y - 1; } -#elif ISPC +#elif defined(ISPC) SCALAR_FUN_ATTR int32_t futrts_ctzz8(int8_t x) { return x == 0 ? 8 : count_trailing_zeros((int32_t)x); @@ -1628,7 +1628,7 @@ SCALAR_FUN_ATTR float fpow32(float x, float y) { return pow(x, y); } -#elif ISPC +#elif defined(ISPC) SCALAR_FUN_ATTR float fabs32(float x) { return abs(x); @@ -1645,7 +1645,7 @@ SCALAR_FUN_ATTR float fmin32(float x, float y) { SCALAR_FUN_ATTR float fpow32(float a, float b) { float ret; foreach_active (i) { - uniform float r = __stdlib_powf(extract(a, i), extract(b, i)); + uniform float r = pow(extract(a, i), extract(b, i)); ret = insert(ret, i, r); } return ret; @@ -1674,7 +1674,7 @@ SCALAR_FUN_ATTR bool futrts_isnan32(float x) { return isnan(x); } -#if ISPC +#if defined(ISPC) SCALAR_FUN_ATTR bool futrts_isinf32(float x) { return !isnan(x) && isnan(x - x); @@ -1905,7 +1905,7 @@ SCALAR_FUN_ATTR float futrts_fma32(float a, float b, float c) { return fma(a, b, c); } -#elif ISPC +#elif defined(ISPC) SCALAR_FUN_ATTR float futrts_log32(float x) { return futrts_isfinite32(x) || (futrts_isinf32(x) && x < 0)? log(x) : x; @@ -2107,7 +2107,7 @@ SCALAR_FUN_ATTR float futrts_lerp32(float v0, float v1, float t) { } SCALAR_FUN_ATTR float futrts_ldexp32(float x, int32_t y) { - return x * pow((double)2.0, (double)y); + return x * pow((uniform float)2.0, (float)y); } SCALAR_FUN_ATTR float futrts_copysign32(float x, float y) { @@ -2267,7 +2267,7 @@ SCALAR_FUN_ATTR float futrts_fma32(float a, float b, float c) { } #endif -#if ISPC +#if defined(ISPC) SCALAR_FUN_ATTR int32_t futrts_to_bits32(float x) { return intbits(x); } @@ -2306,7 +2306,7 @@ SCALAR_FUN_ATTR float fsignum32(float x) { SCALAR_FUN_ATTR double futrts_from_bits64(int64_t x); SCALAR_FUN_ATTR int64_t futrts_to_bits64(double x); -#if ISPC +#if defined(ISPC) SCALAR_FUN_ATTR bool futrts_isinf64(float x) { return !isnan(x) && isnan(x - x); } @@ -2386,7 +2386,7 @@ SCALAR_FUN_ATTR double fmin64(double x, double y) { SCALAR_FUN_ATTR double fpow64(double a, double b) { float ret; foreach_active (i) { - uniform float r = __stdlib_powf(extract(a, i), extract(b, i)); + uniform float r = pow(extract(a, i), extract(b, i)); ret = insert(ret, i, r); } return ret; @@ -2673,7 +2673,7 @@ SCALAR_FUN_ATTR double futrts_lerp64(double v0, double v1, double t) { } SCALAR_FUN_ATTR double futrts_ldexp64(double x, int32_t y) { - return x * pow((double)2.0, (double)y); + return x * pow((uniform double)2.0, (double)y); } SCALAR_FUN_ATTR double futrts_copysign64(double x, double y) { diff --git a/rts/c/scalar_f16.h b/rts/c/scalar_f16.h index f6387c2c0b..eb20c7806e 100644 --- a/rts/c/scalar_f16.h +++ b/rts/c/scalar_f16.h @@ -23,7 +23,7 @@ // compiler will have to be real careful! typedef float f16; -#elif ISPC +#elif defined(ISPC) typedef float16 f16; #else @@ -154,7 +154,7 @@ SCALAR_FUN_ATTR f16 fpow16(f16 x, f16 y) { return pow(x, y); } -#elif ISPC +#elif defined(ISPC) SCALAR_FUN_ATTR f16 fabs16(f16 x) { return abs(x); } @@ -190,7 +190,7 @@ SCALAR_FUN_ATTR f16 fpow16(f16 x, f16 y) { } #endif -#if ISPC +#if defined(ISPC) SCALAR_FUN_ATTR bool futrts_isinf16(float x) { return !futrts_isnan16(x) && futrts_isnan16(x - x); } @@ -345,7 +345,7 @@ SCALAR_FUN_ATTR f16 futrts_mad16(f16 a, f16 b, f16 c) { SCALAR_FUN_ATTR f16 futrts_fma16(f16 a, f16 b, f16 c) { return fma(a, b, c); } -#elif ISPC +#elif defined(ISPC) SCALAR_FUN_ATTR f16 futrts_log16(f16 x) { return futrts_isfinite16(x) || (futrts_isinf16(x) && x < 0) ? log(x) : x; @@ -664,7 +664,7 @@ SCALAR_FUN_ATTR int16_t futrts_to_bits16(f16 x) { SCALAR_FUN_ATTR f16 futrts_from_bits16(int16_t x) { return __ushort_as_half(x); } -#elif ISPC +#elif defined(ISPC) SCALAR_FUN_ATTR int16_t futrts_to_bits16(f16 x) { varying int16_t y = *((varying int16_t * uniform)&x); @@ -916,7 +916,7 @@ SCALAR_FUN_ATTR double fpconv_f16_f64(f16 x) { return (double) x; } -#if ISPC +#if defined(ISPC) SCALAR_FUN_ATTR f16 fpconv_f64_f16(double x) { return (f16) ((float)x); } diff --git a/rts/c/uniform.h b/rts/c/uniform.h index cd457e24dc..a9b7bfcc95 100644 --- a/rts/c/uniform.h +++ b/rts/c/uniform.h @@ -1,10 +1,9 @@ - // Start of uniform.h // Uniform versions of all library functions as to // improve performance in ISPC when in an uniform context. -#if ISPC +#if defined(ISPC) static inline uniform uint8_t add8(uniform uint8_t x, uniform uint8_t y) { return x + y; @@ -839,7 +838,7 @@ static inline uniform float fmin32(uniform float x, uniform float y) { } static inline uniform float fpow32(uniform float x, uniform float y) { - return __stdlib_powf(x, y); + return pow(x, y); } static inline uniform bool futrts_isnan32(uniform float x) { @@ -1181,7 +1180,7 @@ static inline uniform double fmin64(uniform double x, uniform double y) { } static inline uniform double fpow64(uniform double x, uniform double y) { - return __stdlib_powf(x, y); + return pow(x, y); } static inline uniform double futrts_log64(uniform double x) { @@ -1445,7 +1444,7 @@ static inline uniform double fpconv_f16_f64(uniform f16 x) { } static inline uniform f16 fpconv_f64_f16(uniform double x) { - return (uniform f16) ((uniform float)x); + return (uniform f16) ((uniform float)x); } #endif diff --git a/src/Futhark/CodeGen/Backends/MulticoreC.hs b/src/Futhark/CodeGen/Backends/MulticoreC.hs index a624fe8081..ead3848a85 100644 --- a/src/Futhark/CodeGen/Backends/MulticoreC.hs +++ b/src/Futhark/CodeGen/Backends/MulticoreC.hs @@ -138,7 +138,7 @@ compileSetRetvalStructValues struct vnames we = concat $ zipWith field vnames we where field name (ct, Prim _) = [C.cstms|$id:struct.$id:(closureRetvalStructField name)=(($ty:ct*)&$id:name); - $escstm:("#if ISPC") + $escstm:("#if defined(ISPC)") $id:struct.$id:(closureRetvalStructField name)+= programIndex; $escstm:("#endif")|] field name (_, MemBlock) = diff --git a/src/Futhark/CodeGen/Backends/MulticoreISPC.hs b/src/Futhark/CodeGen/Backends/MulticoreISPC.hs index 09b92e5fae..805e16faef 100644 --- a/src/Futhark/CodeGen/Backends/MulticoreISPC.hs +++ b/src/Futhark/CodeGen/Backends/MulticoreISPC.hs @@ -803,7 +803,7 @@ compileOp (SegOp name params seq_task par_task retvals (SchedulerInfo e sched)) aos_name <- newVName "aos" GC.items [C.citems| - $escstm:("#if ISPC") + $escstm:("#if defined(ISPC)") $tyqual:uniform struct $id:fstruct $id:aos_name[programCount]; $id:aos_name[programIndex] = $id:(fstruct <> "_"); $escstm:("foreach_active (i)") diff --git a/tests/ad/issue1473.fut b/tests/ad/issue1473.fut index 42f15ae9a6..a8207b757d 100644 --- a/tests/ad/issue1473.fut +++ b/tests/ad/issue1473.fut @@ -1,72 +1,69 @@ -- test mpr sim with ad for params --- == def pi = 3.141592653589793f32 -- some type abbreviations type mpr_pars = {G: f32, I: f32, Delta: f32, eta: f32, tau: f32, J: f32} type mpr_node = (f32, f32) -type mpr_net [n] = [n] mpr_node +type mpr_net [n] = [n]mpr_node + -- this is tranposed from mpr-pdq to avoid tranposes in history update -type mpr_hist [t] [n] = [t] mpr_net [n] +type mpr_hist [t] [n] = [t]mpr_net [n] type connectome [n] = {weights: [n][n]f32, idelays: [n][n]i64} -- do one time step w/ Euler -def mpr_step [t] [n] (now: i64) (dt: f32) (buf: *mpr_hist[t][n]) (conn: connectome[n]) (p: mpr_pars): *mpr_hist[t][n] = - - -- define individual derivatives as in mpr pdq - let dr r V = 1/p.tau * ( p.Delta / (pi * p.tau) + 2 * V * r) - let dV r V r_c = 1/p.tau * ( V**2 - pi**2 * p.tau**2 * r**2 + p.eta + p.J * p.tau * r + p.I + r_c) - let dfun (r, V, c) = (dr r V, dV r V c) - - -- unpack current state for clarity - let (r, V) = last buf |> unzip - - -- connectivity eval - let r_c_i i w d = map2 (\wj dj -> wj * buf[now - dj, i].0) w d |> reduce (+) 0f32 |> (*p.G) - let r_c = map3 r_c_i (iota n) conn.weights conn.idelays - - -- Euler step - let erV = map3 (\r V c -> (dr r V, dV r V c)) r V r_c - |> map2 (\(r, V) (dr, dV) -> (r + dt * dr, V + dt * dV)) (last buf) - |> map1 (\(r, V) -> (if r >= 0f32 then r else 0f32, V)) - - -- now for the Heun step - let (er, eV) = unzip erV - let hrV = map3 (\r V c -> (dr r V, dV r V c)) er eV r_c - |> map2 (\(r, V) (dr, dV) -> (r + dt * dr, V + dt * dV)) (last buf) - |> map1 (\(r, V) -> (if r >= 0f32 then r else 0f32, V)) - - -- return updated buffer - in buf with [now + 1] = copy hrV +def mpr_step [t] [n] (now: i64) (dt: f32) (buf: *mpr_hist [t] [n]) (conn: connectome [n]) (p: mpr_pars) : *mpr_hist [t] [n] = + -- define individual derivatives as in mpr pdq + let dr r V = 1 / p.tau * (p.Delta / (pi * p.tau) + 2 * V * r) + let dV r V r_c = 1 / p.tau * (V ** 2 - pi ** 2 * p.tau ** 2 * r ** 2 + p.eta + p.J * p.tau * r + p.I + r_c) + let dfun (r, V, c) = (dr r V, dV r V c) + -- unpack current state for clarity + let (r, V) = last buf |> unzip + -- connectivity eval + let r_c_i i w d = map2 (\wj dj -> wj * buf[now - dj, i].0) w d |> reduce (+) 0f32 |> (* p.G) + let r_c = map3 r_c_i (iota n) conn.weights conn.idelays + -- Euler step + let erV = + map3 (\r V c -> (dr r V, dV r V c)) r V r_c + |> map2 (\(r, V) (dr, dV) -> (r + dt * dr, V + dt * dV)) (last buf) + |> map1 (\(r, V) -> (if r >= 0f32 then r else 0f32, V)) + -- now for the Heun step + let (er, eV) = unzip erV + let hrV = + map3 (\r V c -> (dr r V, dV r V c)) er eV r_c + |> map2 (\(r, V) (dr, dV) -> (r + dt * dr, V + dt * dV)) (last buf) + |> map1 (\(r, V) -> (if r >= 0f32 then r else 0f32, V)) + -- return updated buffer + in buf with [now + 1] = copy hrV -def run_mpr [t] [n] (horizon: i64) (dt: f32) (buf: mpr_hist[t][n]) (conn: connectome[n]) (p: mpr_pars): mpr_hist[t][n] = - loop buf = copy buf - for now < (t - horizon - 1) do mpr_step (now + horizon) dt buf conn p +def run_mpr [t] [n] (horizon: i64) (dt: f32) (buf: mpr_hist [t] [n]) (conn: connectome [n]) (p: mpr_pars) : mpr_hist [t] [n] = + loop buf = copy buf + for now < (t - horizon - 1) do + mpr_step (now + horizon) dt buf conn p -def mpr_pars_with_G (p: mpr_pars) (new_G: f32): mpr_pars = - let new_p = copy p - in new_p with G = new_G +def mpr_pars_with_G (p: mpr_pars) (new_G: f32) : mpr_pars = + let new_p = copy p + in new_p with G = new_G -def loss [t] [n] (x:mpr_hist[t][n]): f32 = - let r = map unzip x[t-10:] |> unzip |> (.0) - let sum = map (reduce (+) 0f32) r |> reduce (+) 0f32 - in - sum +def loss [t] [n] (x: mpr_hist [t] [n]) : f32 = + let r = map unzip x[t - 10:] |> unzip |> (.0) + let sum = map (reduce (+) 0f32) r |> reduce (+) 0f32 + in sum -def sweep [t] [n] (ng: i64) (horizon: i64) (dt: f32) (buf: mpr_hist[t][n]) (conn: connectome[n]) (p: mpr_pars): [ng]f32 = - let Gs = tabulate ng (\i -> 0.0 + (f32.i64 i) * 0.1) - let do_one G = run_mpr horizon dt buf conn (mpr_pars_with_G p G) |> loss - in map (\g -> vjp do_one g 1f32) Gs +def sweep [t] [n] (ng: i64) (horizon: i64) (dt: f32) (buf: mpr_hist [t] [n]) (conn: connectome [n]) (p: mpr_pars) : [ng]f32 = + let Gs = tabulate ng (\i -> 0.0 + (f32.i64 i) * 0.1) + let do_one G = run_mpr horizon dt buf conn (mpr_pars_with_G p G) |> loss + in map (\g -> vjp do_one g 1f32) Gs -- == --- compiled input { 1i64 5i64 10i64 7i64 } +-- no_ispc compiled input { 1i64 5i64 10i64 7i64 } -- output { [0.000086f32] } def main (ng: i64) (nh: i64) (nt: i64) (nn: i64) = - let dt = 0.01f32 - let buf = tabulate_2d (nt + nh) nn (\i j -> (0.1f32, -2.0f32)) - let conn = {weights=tabulate_2d nn nn (\i j -> 0.1f32), - idelays=tabulate_2d nn nn (\i j -> ((i * j) % nh)) - } - let p = {G=0.1f32, I=0.0f32, Delta=0.7f32, eta=(-4.6f32), tau=1.0f32, J=14.5f32} - in sweep ng nh dt buf conn p + let dt = 0.01f32 + let buf = tabulate_2d (nt + nh) nn (\i j -> (0.1f32, -2.0f32)) + let conn = + { weights = tabulate_2d nn nn (\i j -> 0.1f32) + , idelays = tabulate_2d nn nn (\i j -> ((i * j) % nh)) + } + let p = {G = 0.1f32, I = 0.0f32, Delta = 0.7f32, eta = (-4.6f32), tau = 1.0f32, J = 14.5f32} + in sweep ng nh dt buf conn p diff --git a/tests/tiling/tiling_1d_complex.fut b/tests/tiling/tiling_1d_complex.fut index 79b78730d4..fa9641b533 100644 --- a/tests/tiling/tiling_1d_complex.fut +++ b/tests/tiling/tiling_1d_complex.fut @@ -1,24 +1,26 @@ -- More stuff that can go wrong with a larger tiling prelude, but -- still just 1D tiling. -- == --- compiled random input { [2000]f32 [2000]f32 } auto output +-- no_ispc compiled random input { [2000]f32 [2000]f32 } auto output -- structure gpu { SegMap/Loop/SegMap 2 } -type point = (f32,f32) +type point = (f32, f32) -def add_points ((x1,y1): point) ((x2,y2): point): point = - (x1+x2, y1+y2) +def add_points ((x1, y1): point) ((x2, y2): point) : point = + (x1 + x2, y1 + y2) -def euclid_dist_2 ((x1,y1): point) ((x2,y2): point): f32 = - (x2-x1)**2.0f32 + (y2-y1)**2.0f32 +def euclid_dist_2 ((x1, y1): point) ((x2, y2): point) : f32 = + (x2 - x1) ** 2.0f32 + (y2 - y1) ** 2.0f32 -def closest_point (p1: (i32, f32)) (p2: (i32, f32)): (i32, f32) = +def closest_point (p1: (i32, f32)) (p2: (i32, f32)) : (i32, f32) = if p1.1 < p2.1 then p1 else p2 -def find_nearest_point [k] (pts: [k]point) (pt: point): i32 = - let (i, _) = reduce_comm closest_point (0, euclid_dist_2 pt pts[0]) - (zip (map i32.i64 (iota k)) - (map (euclid_dist_2 pt) pts)) +def find_nearest_point [k] (pts: [k]point) (pt: point) : i32 = + let (i, _) = + reduce_comm closest_point + (0, euclid_dist_2 pt pts[0]) + (zip (map i32.i64 (iota k)) + (map (euclid_dist_2 pt) pts)) in i def main [n] (xs: [n]f32) (ys: [n]f32) =