diff --git a/spmvm/spmvm.cuh b/spmvm/spmvm.cuh index de8c7d7..b2b6c0a 100644 --- a/spmvm/spmvm.cuh +++ b/spmvm/spmvm.cuh @@ -40,6 +40,7 @@ struct spmvm_context_t scalar_t *d_scalars; // output scalars scalar_t *d_out; + }; template @@ -84,10 +85,14 @@ struct spmvm_host_t const scalar_t *data; const size_t *col_idx; const size_t *row_ptr; + const size_t *blocks; size_t num_rows; size_t num_cols; size_t nnz; + + size_t num_blocks; + size_t block_size; }; template @@ -115,21 +120,21 @@ public: // scalar_t *d_out; public: - spmvm_t(size_t num_rows, size_t num_cols, size_t nnz, int device_id = -1) + spmvm_t(spmvm_host_t *csr, int device_id = -1) : gpu(select_gpu(device_id)) { this->context = reinterpret_cast *>(malloc(sizeof(spmvm_context_t))); - context->d_data = reinterpret_cast(gpu.Dmalloc(nnz * sizeof(scalar_t))); - context->d_col_idx = reinterpret_cast(gpu.Dmalloc(nnz * sizeof(size_t))); - context->d_row_ptr = reinterpret_cast(gpu.Dmalloc((num_rows + 1) * sizeof(size_t))); + context->d_data = reinterpret_cast(gpu.Dmalloc(csr->block_size * sizeof(scalar_t))); + context->d_col_idx = reinterpret_cast(gpu.Dmalloc(csr->block_size * sizeof(size_t))); + context->d_row_ptr = reinterpret_cast(gpu.Dmalloc((csr->num_rows + 1) * sizeof(size_t))); - context->num_rows = num_rows; - context->num_cols = num_cols; - context->nnz = nnz; + context->num_rows = csr->num_rows; + context->num_cols = csr->num_cols; + context->nnz = csr->nnz; - context->d_scalars = reinterpret_cast(gpu.Dmalloc(num_cols * sizeof(scalar_t))); - context->d_out = reinterpret_cast(gpu.Dmalloc(num_rows * sizeof(scalar_t))); + context->d_scalars = reinterpret_cast(gpu.Dmalloc(csr->num_cols * sizeof(scalar_t))); + context->d_out = reinterpret_cast(gpu.Dmalloc(csr->num_rows * sizeof(scalar_t))); this->owned = true; } @@ -202,21 +207,49 @@ public: try { - if (csr->data) - gpu[2].HtoD(&context->d_data[0], &csr->data[0], context->nnz); - if (csr->col_idx) - gpu[2].HtoD(&context->d_col_idx[0], &csr->col_idx[0], context->nnz); - if (csr->row_ptr) - gpu[2].HtoD(&context->d_row_ptr[0], &csr->row_ptr[0], context->num_rows + 1); - if (scalars) gpu[2].HtoD(&context->d_scalars[0], &scalars[0], context->num_cols); - - spmvm_context_t *d_context = reinterpret_cast *>(gpu[2].Dmalloc(sizeof(spmvm_context_t))); - gpu[2].HtoD(d_context, context, 1); cudaMemsetAsync(&context->d_out[0], 0, context->num_rows * sizeof(scalar_t), gpu[2]); - csr_vector_mul<<>>(d_context); - CUDA_OK(cudaGetLastError()); + + size_t start_row = 0; + size_t end_row = 0; + size_t num_rows = 0; + + size_t start_data = 0; + size_t end_data = 0; + size_t num_data = 0; + + for (size_t i = 0; i < csr->num_blocks - 1; ++i) { + start_row = csr->blocks[i]; + end_row = csr->blocks[i + 1]; + num_rows = end_row - start_row; + + start_data = csr->row_ptr[start_row]; + end_data = csr->row_ptr[end_row]; + num_data = end_data - start_data; + + if (csr->data) + gpu[i&1].HtoD(&context->d_data[0], &csr->data[start_data], num_data); + CUDA_OK(cudaGetLastError()); + if (csr->col_idx) + gpu[i&1].HtoD(&context->d_col_idx[0], &csr->col_idx[start_data], num_data); + CUDA_OK(cudaGetLastError()); + if (csr->row_ptr) { + gpu[i&1].HtoD(&context->d_row_ptr[0], &csr->row_ptr[start_row], num_rows + 1); + csr->num_rows = num_rows; + } + CUDA_OK(cudaGetLastError()); + + gpu[i&1].sync(); + + spmvm_context_t *d_context = reinterpret_cast *>(gpu[i&1].Dmalloc(sizeof(spmvm_context_t))); + gpu[i&1].HtoD(d_context, context, 1); + + csr_vector_mul<<>>(d_context); + CUDA_OK(cudaGetLastError()); + + gpu[i&1].sync(); + } gpu[2].DtoH(&out[0], &context->d_out[0], context->num_rows); gpu.sync(); @@ -240,10 +273,7 @@ static RustError sparse_matrix_mul(spmvm_host_t *csr, const scalar_t * { try { - size_t num_rows = csr->num_rows; - size_t num_cols = csr->num_cols; - size_t nnz = csr->nnz; - spmvm_t spmvm{num_rows, num_cols, nnz}; + spmvm_t spmvm{csr}; return spmvm.invoke(csr, scalars, out, nthreads); } catch (const cuda_error &e)