Skip to content

Commit

Permalink
remove dependency on native library for ConjugateGradient
Browse files Browse the repository at this point in the history
implement scalar prod in c#
  • Loading branch information
Regis Portalez committed Feb 20, 2019
1 parent e3efa70 commit bb0fd2b
Show file tree
Hide file tree
Showing 2 changed files with 48 additions and 24 deletions.
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
<?xml version="1.0" encoding="utf-8"?>
<?xml version="1.0" encoding="utf-8"?>
<Project ToolsVersion="4.0" DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<Import Project="$(MSBuildExtensionsPath)\$(MSBuildToolsVersion)\Microsoft.Common.props" Condition="Exists('$(MSBuildExtensionsPath)\$(MSBuildToolsVersion)\Microsoft.Common.props')" />
<PropertyGroup>
Expand Down Expand Up @@ -54,12 +54,6 @@
<ItemGroup>
<None Include="App.config" />
</ItemGroup>
<ItemGroup>
<Content Include="..\..\..\x64\$(Configuration)\hybridizer_cuda_reduction.dll">
<Link>hybridizer_cuda_reduction.dll</Link>
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
</Content>
</ItemGroup>
<Import Project="$(MSBuildToolsPath)\Microsoft.CSharp.targets" />
<!-- To modify your build process, add your task inside one of the targets below and uncomment it.
Other similar extension points exist, see Microsoft.Common.targets.
Expand All @@ -68,4 +62,4 @@
<Target Name="AfterBuild">
</Target>
-->
</Project>
</Project>
Original file line number Diff line number Diff line change
Expand Up @@ -16,28 +16,29 @@ static void Main(string[] args)
// configure CUDA
cudaDeviceProp prop;
cuda.GetDeviceProperties(out prop, 0);
runner = HybRunner.Cuda().SetDistrib(prop.multiProcessorCount * 16, 128);
runner = HybRunner.Cuda().SetDistrib(prop.multiProcessorCount * 16, 1, 128, 1, 1, 128 * sizeof(float));
wrapper = runner.Wrap(new Program());

int size = 1000000; // very slow convergence with no preconditioner
int size = 10000; // very slow convergence (or even not at all) with no preconditioner
SparseMatrix A = SparseMatrix.Laplacian_1D(size);
FloatResidentArray B = new FloatResidentArray(size);
FloatResidentArray X = new FloatResidentArray(size);

int maxiter = 1000;
int maxiter = 1000000;
float eps = 1.0e-09f;

for (int i = 0; i < size; ++i)
{
B[i] = 1.0f; // right side
X[i] = 0.0f; // starting point
}

ConjugateGradient(X, A, B, maxiter, eps);
}

public static void ConjugateGradient(FloatResidentArray X, SparseMatrix A, FloatResidentArray B, int maxiter, float eps)
{
float[] scalBuf = new float[1] { 0 };
int N = (int) B.Count;
FloatResidentArray R = new FloatResidentArray(N);
FloatResidentArray P = new FloatResidentArray(N);
Expand All @@ -52,11 +53,13 @@ public static void ConjugateGradient(FloatResidentArray X, SparseMatrix A, Float
while(k < maxiter)
{
wrapper.Multiply(AP, A, P, N); // AP = A*P
float r = ScalarProd(R, R, N); // save <R|R>
float alpha = r / ScalarProd(P, AP, N); // alpha = <R|R> / <P|AP>
scalBuf[0] = 0; wrapper.ScalarProd(scalBuf, R, R, N); float r = scalBuf[0]; // save <R|R>
scalBuf[0] = 0; wrapper.ScalarProd(scalBuf, P, AP, N); float alpha = r / scalBuf[0]; // alpha = <R|R> / <P|AP>
wrapper.Saxpy(X, X, alpha, P, N); // X = X - alpha*P
wrapper.Saxpy(R, R, -alpha, AP, N); // RR = R-alpha*AP
float rr = ScalarProd(R, R, N);
scalBuf[0] = 0; wrapper.ScalarProd(scalBuf, R, R, N); float rr = scalBuf[0];
if (k % 10 == 0)
Console.WriteLine(rr);
if(rr < eps*eps)
{
break;
Expand All @@ -70,6 +73,42 @@ public static void ConjugateGradient(FloatResidentArray X, SparseMatrix A, Float
X.RefreshHost();
}

// GPU kernel: computes the dot product &lt;r1|r2&gt; over N elements and
// atomically accumulates each block's partial sum into result[0].
// Caller must zero result[0] before launch (the CG loop resets
// scalBuf[0] = 0 before every call). NOTE(review): the tree reduction
// below assumes blockDim.x is a power of two (the launch config uses
// 128) — confirm if the launch shape ever changes.
[EntryPoint]
private static void ScalarProd(float[] result, FloatResidentArray r1, FloatResidentArray r2, int N)
{
// One float of shared memory per thread; the launch in Main reserves
// 128 * sizeof(float) of shared memory to back this allocation.
var cache = new SharedMemoryAllocator&lt;float&gt;().allocate(blockDim.x);
int tid = threadIdx.x + blockDim.x * blockIdx.x;
int cacheIndex = threadIdx.x;

// Grid-stride loop: each thread sums products at indices
// tid, tid + gridDim.x*blockDim.x, ... so any N is covered
// regardless of how many threads were launched.
float tmp = 0.0F;
while (tid &lt; N)
{
tmp += r1[tid] * r2[tid];
tid += blockDim.x * gridDim.x;
}

// Stage this thread's partial sum in shared memory.
cache[cacheIndex] = tmp;

// All partial sums must be visible before the tree reduction starts.
CUDAIntrinsics.__syncthreads();

// In-block tree reduction: halve the active thread count each pass,
// folding the upper half of the cache into the lower half.
int i = blockDim.x / 2;
while (i != 0)
{
if (cacheIndex &lt; i)
{
cache[cacheIndex] += cache[cacheIndex + i];
}

// Barrier is outside the if so every thread in the block reaches it.
CUDAIntrinsics.__syncthreads();
i &gt;&gt;= 1;
}

// Thread 0 now holds the block's total; combine across blocks with an
// atomic add into the single-element result buffer.
if (cacheIndex == 0)
{
AtomicExpr.apply(ref result[0], cache[0], (x, y) =&gt; x + y);
}
}

[EntryPoint]
public static void Copy(FloatResidentArray res, FloatResidentArray src, int N)
{
Expand All @@ -88,15 +127,6 @@ public static void Saxpy(FloatResidentArray res, FloatResidentArray x, float alp
});
}


public static float ScalarProd(FloatResidentArray X, FloatResidentArray Y, int N)
{
return inner_scalar_prod((float*) X.DevicePointer, (float*) Y.DevicePointer, N);
}

[DllImport("hybridizer_cuda_reduction.dll", CallingConvention = CallingConvention.Cdecl, EntryPoint = "ScalarProd_float")]
public extern static float inner_scalar_prod(float* X, float* Y, int N);

// res = A - m*v
[EntryPoint]
public static void Fmsub(FloatResidentArray res, FloatResidentArray A, SparseMatrix m, FloatResidentArray v, int N)
Expand Down

0 comments on commit bb0fd2b

Please sign in to comment.