Skip to content

Commit

Permalink
remove dependency on native library for ConjugateGradient
Browse files Browse the repository at this point in the history
implement scalar prod in c#
  • Loading branch information
Regis Portalez committed Feb 20, 2019
1 parent e3efa70 commit bb0fd2b
Show file tree
Hide file tree
Showing 2 changed files with 48 additions and 24 deletions.
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
<?xml version="1.0" encoding="utf-8"?>
<?xml version="1.0" encoding="utf-8"?>
<Project ToolsVersion="4.0" DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<Import Project="$(MSBuildExtensionsPath)\$(MSBuildToolsVersion)\Microsoft.Common.props" Condition="Exists('$(MSBuildExtensionsPath)\$(MSBuildToolsVersion)\Microsoft.Common.props')" />
<PropertyGroup>
Expand Down Expand Up @@ -54,12 +54,6 @@
<ItemGroup>
<None Include="App.config" />
</ItemGroup>
<ItemGroup>
<Content Include="..\..\..\x64\$(Configuration)\hybridizer_cuda_reduction.dll">
<Link>hybridizer_cuda_reduction.dll</Link>
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
</Content>
</ItemGroup>
<Import Project="$(MSBuildToolsPath)\Microsoft.CSharp.targets" />
<!-- To modify your build process, add your task inside one of the targets below and uncomment it.
Other similar extension points exist, see Microsoft.Common.targets.
Expand All @@ -68,4 +62,4 @@
<Target Name="AfterBuild">
</Target>
-->
</Project>
</Project>
Original file line number Diff line number Diff line change
Expand Up @@ -16,28 +16,29 @@ static void Main(string[] args)
// configure CUDA
cudaDeviceProp prop;
cuda.GetDeviceProperties(out prop, 0);
runner = HybRunner.Cuda().SetDistrib(prop.multiProcessorCount * 16, 128);
runner = HybRunner.Cuda().SetDistrib(prop.multiProcessorCount * 16, 1, 128, 1, 1, 128 * sizeof(float));
wrapper = runner.Wrap(new Program());

int size = 1000000; // very slow convergence with no preconditioner
int size = 10000; // very slow convergence (or even not at all) with no preconditioner
SparseMatrix A = SparseMatrix.Laplacian_1D(size);
FloatResidentArray B = new FloatResidentArray(size);
FloatResidentArray X = new FloatResidentArray(size);

int maxiter = 1000;
int maxiter = 1000000;
float eps = 1.0e-09f;

for (int i = 0; i < size; ++i)
{
B[i] = 1.0f; // right side
X[i] = 0.0f; // starting point
}

ConjugateGradient(X, A, B, maxiter, eps);
}

public static void ConjugateGradient(FloatResidentArray X, SparseMatrix A, FloatResidentArray B, int maxiter, float eps)
{
float[] scalBuf = new float[1] { 0 };
int N = (int) B.Count;
FloatResidentArray R = new FloatResidentArray(N);
FloatResidentArray P = new FloatResidentArray(N);
Expand All @@ -52,11 +53,13 @@ public static void ConjugateGradient(FloatResidentArray X, SparseMatrix A, Float
while(k < maxiter)
{
wrapper.Multiply(AP, A, P, N); // AP = A*P
float r = ScalarProd(R, R, N); // save <R|R>
float alpha = r / ScalarProd(P, AP, N); // alpha = <R|R> / <P|AP>
scalBuf[0] = 0; wrapper.ScalarProd(scalBuf, R, R, N); float r = scalBuf[0]; // save <R|R>
scalBuf[0] = 0; wrapper.ScalarProd(scalBuf, P, AP, N); float alpha = r / scalBuf[0]; // alpha = <R|R> / <P|AP>
wrapper.Saxpy(X, X, alpha, P, N); // X = X - alpha*P
wrapper.Saxpy(R, R, -alpha, AP, N); // RR = R-alpha*AP
float rr = ScalarProd(R, R, N);
scalBuf[0] = 0; wrapper.ScalarProd(scalBuf, R, R, N); float rr = scalBuf[0];
if (k % 10 == 0)
Console.WriteLine(rr);
if(rr < eps*eps)
{
break;
Expand All @@ -70,6 +73,42 @@ public static void ConjugateGradient(FloatResidentArray X, SparseMatrix A, Float
X.RefreshHost();
}

// GPU kernel: computes the dot product &lt;r1|r2&gt; over N elements and
// atomically accumulates each block's partial sum into result[0].
// Caller must zero result[0] before launch (the CG loop resets
// scalBuf[0] = 0 before every call). NOTE(review): the tree reduction
// below assumes blockDim.x is a power of two (the launch config uses
// 128) — confirm if the launch shape ever changes.
[EntryPoint]
private static void ScalarProd(float[] result, FloatResidentArray r1, FloatResidentArray r2, int N)
{
// One float of shared memory per thread; the launch in Main reserves
// 128 * sizeof(float) of shared memory to back this allocation.
var cache = new SharedMemoryAllocator&lt;float&gt;().allocate(blockDim.x);
int tid = threadIdx.x + blockDim.x * blockIdx.x;
int cacheIndex = threadIdx.x;

// Grid-stride loop: each thread sums products at indices
// tid, tid + gridDim.x*blockDim.x, ... so any N is covered
// regardless of how many threads were launched.
float tmp = 0.0F;
while (tid &lt; N)
{
tmp += r1[tid] * r2[tid];
tid += blockDim.x * gridDim.x;
}

// Stage this thread's partial sum in shared memory.
cache[cacheIndex] = tmp;

// All partial sums must be visible before the tree reduction starts.
CUDAIntrinsics.__syncthreads();

// In-block tree reduction: halve the active thread count each pass,
// folding the upper half of the cache into the lower half.
int i = blockDim.x / 2;
while (i != 0)
{
if (cacheIndex &lt; i)
{
cache[cacheIndex] += cache[cacheIndex + i];
}

// Barrier is outside the if so every thread in the block reaches it.
CUDAIntrinsics.__syncthreads();
i &gt;&gt;= 1;
}

// Thread 0 now holds the block's total; combine across blocks with an
// atomic add into the single-element result buffer.
if (cacheIndex == 0)
{
AtomicExpr.apply(ref result[0], cache[0], (x, y) =&gt; x + y);
}
}

[EntryPoint]
public static void Copy(FloatResidentArray res, FloatResidentArray src, int N)
{
Expand All @@ -88,15 +127,6 @@ public static void Saxpy(FloatResidentArray res, FloatResidentArray x, float alp
});
}


public static float ScalarProd(FloatResidentArray X, FloatResidentArray Y, int N)
{
return inner_scalar_prod((float*) X.DevicePointer, (float*) Y.DevicePointer, N);
}

[DllImport("hybridizer_cuda_reduction.dll", CallingConvention = CallingConvention.Cdecl, EntryPoint = "ScalarProd_float")]
public extern static float inner_scalar_prod(float* X, float* Y, int N);

// res = A - m*v
[EntryPoint]
public static void Fmsub(FloatResidentArray res, FloatResidentArray A, SparseMatrix m, FloatResidentArray v, int N)
Expand Down

0 comments on commit bb0fd2b

Please sign in to comment.