Merge pull request #178 from jtkrogel/offload_spline_v

Add spline evaluate_v offload
QMCPACK · Sep 13, 2018 · bd7e78a · bd7e78a
2 parents 15e8597 + 2370e5f
commit bd7e78a
Show file tree

Hide file tree

Showing 2 changed files with 89 additions and 2 deletions.
diff --git a/src/Numerics/Spline2/MultiBsplineOffload.hpp b/src/Numerics/Spline2/MultiBsplineOffload.hpp
@@ -48,6 +48,9 @@ template <typename T> struct MultiBsplineOffload
   static void evaluate_v(const spliner_type *restrict spline_m, T x, T y, T z, T *restrict vals,
                   size_t num_splines);
 
+  static void evaluate_v_v2(const spliner_type *restrict spline_m, T x, T y, T z, T *restrict vals, // values only: writes vals[0 .. num_splines-1]
+                  size_t num_splines);
+
   static void evaluate_vgl(const spliner_type *restrict spline_m, T x, T y, T z, T *restrict vals, T *restrict grads,
                     T *restrict lapl, size_t num_splines);
 
@@ -100,6 +103,53 @@ inline void MultiBsplineOffload<T>::evaluate_v(const spliner_type *restrict spli
     }
 }
 
+template <typename T>
+inline void MultiBsplineOffload<T>::evaluate_v_v2(const spliner_type *restrict spline_m,
+                                           T x, T y, T z, T *restrict vals,
+                                           size_t num_splines)
+{ // Evaluate num_splines spline values at the point (x, y, z) and store them in vals[0 .. num_splines-1].
+  x -= spline_m->x_grid.start; // make the coordinates relative to the grid start on each axis
+  y -= spline_m->y_grid.start;
+  z -= spline_m->z_grid.start;
+  T tx, ty, tz; // filled by SplineBound::get; presumably the fractional offset inside the grid cell -- TODO confirm
+  int ix, iy, iz; // filled by SplineBound::get; used below as the base grid index on each axis
+  SplineBound<T>::get(x * spline_m->x_grid.delta_inv, tx, ix, // coordinate scaled by the inverse grid spacing
+                      spline_m->x_grid.num - 1); // upper limit handed to the helper (presumably clamps ix -- TODO confirm)
+  SplineBound<T>::get(y * spline_m->y_grid.delta_inv, ty, iy,
+                      spline_m->y_grid.num - 1);
+  SplineBound<T>::get(z * spline_m->z_grid.delta_inv, tz, iz,
+                      spline_m->z_grid.num - 1);
+  T a[4], b[4], c[4]; // per-axis weights applied to the 4 grid points used along x, y and z
+
+  MultiBsplineData<T>::compute_prefactors(a, tx); // fills a[] from tx (presumably cubic B-spline basis weights -- TODO confirm)
+  MultiBsplineData<T>::compute_prefactors(b, ty);
+  MultiBsplineData<T>::compute_prefactors(c, tz);
+
+  const intptr_t xs = spline_m->x_stride; // offsets in the coefficient array for one grid step along x, y, z
+  const intptr_t ys = spline_m->y_stride;
+  const intptr_t zs = spline_m->z_stride;
+
+#ifdef ENABLE_OFFLOAD
+  #pragma omp for nowait
+#else
+  #pragma omp simd aligned(vals)
+#endif
+  for (size_t n = 0; n < num_splines; n++) // one output per spline; offload build: iterations shared among threads of the caller's parallel region, host build: SIMD loop
+  { // NOTE: aligned(vals) in the host build means the caller must pass a suitably aligned vals pointer
+    T val = T(); // value-initialized accumulator for spline n
+    for (size_t i = 0; i < 4; i++) // 4 x 4 grid points in x and y starting at (ix, iy)
+      for (size_t j = 0; j < 4; j++)
+        {
+          const T *restrict coefs = // coefficients at grid point (ix+i, iy+j, iz); independent of n
+            spline_m->coefs + (ix + i) * xs + (iy + j) * ys + iz * zs;
+          val += a[i] * b[j] * // x and y weights times the z-weighted sum below
+            (c[0] * coefs[n] + c[1] * coefs[n + zs] + // four z grid points, zs apart, for spline n
+             c[2] * coefs[n + 2 * zs] + c[3] * coefs[n + 3 * zs]);
+        }
+    vals[n] = val;
+  }
+}
+
 template <typename T>
 inline void
 MultiBsplineOffload<T>::evaluate_vgl(const spliner_type *restrict spline_m,

diff --git a/src/QMCWaveFunctions/einspline_spo.hpp b/src/QMCWaveFunctions/einspline_spo.hpp
@@ -199,9 +199,46 @@ struct einspline_spo : public SPOSet
   {
     ScopedTimer local_timer(timer);
 
-    auto u = Lattice.toUnit_floor(p);
+    if (nBlocks != psi_shadows.size())
+    {
+      psi_shadows.resize(nBlocks);
+
+      T** restrict psi_shadows_ptr  = psi_shadows.data();
+      for (int i = 0; i < nBlocks; ++i)
+      {
+        T* restrict psi_ptr  = psi[i].data();
+#ifdef ENABLE_OFFLOAD
+        #pragma omp target map(to : i) device(0)
+#endif
+        {
+          psi_shadows_ptr[i]  = psi_ptr;
+        }
+      }
+    }
+
+    OMPTinyVector<T, 3> u = Lattice.toUnit_floor(p);
+
+    T** restrict psi_shadows_ptr          = psi_shadows.data();
+    spline_type** restrict einsplines_ptr = einsplines.data();
+
+#ifdef ENABLE_OFFLOAD
+    #pragma omp target teams distribute num_teams(nBlocks) device(0) \
+    map(to : nBlocks, nSplinesPerBlock) map(always, to : u)
+#else
+    #pragma omp parallel for
+#endif
     for (int i = 0; i < nBlocks; ++i)
-      compute_engine.evaluate_v(einsplines[i], u[0], u[1], u[2], psi[i].data(), nSplinesPerBlock);
+    {
+#ifdef ENABLE_OFFLOAD
+      #pragma omp parallel num_threads(nSplinesPerBlock)
+#endif
+      MultiBsplineOffload<T>::evaluate_v_v2(einsplines_ptr[i],
+                                            u[0],
+                                            u[1],
+                                            u[2],
+                                            psi_shadows_ptr[i],
+                                            nSplinesPerBlock);
+    }
   }
 
   /** evaluate psi */