Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Submission: Tongbo Sui #2

Open
wants to merge 8 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
414 changes: 206 additions & 208 deletions README.md

Large diffs are not rendered by default.

Binary file added images/all-performance.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added images/custom-only-performance.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added images/no-cpu-performance.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
65 changes: 61 additions & 4 deletions src/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,11 @@
#include <stream_compaction/naive.h>
#include <stream_compaction/efficient.h>
#include <stream_compaction/thrust.h>
#include <stream_compaction/radix.h>
#include <algorithm> // std::sort
#include <vector> // std::vector
#include "testing_helpers.hpp"
#include <chrono>

int main(int argc, char* argv[]) {
const int SIZE = 1 << 8;
Expand Down Expand Up @@ -39,7 +43,7 @@ int main(int argc, char* argv[]) {
StreamCompaction::CPU::scan(NPOT, c, a);
printArray(NPOT, b, true);
printCmpResult(NPOT, b, c);

zeroArray(SIZE, c);
printDesc("naive scan, power-of-two");
StreamCompaction::Naive::scan(SIZE, c, a);
Expand All @@ -63,7 +67,7 @@ int main(int argc, char* argv[]) {
StreamCompaction::Efficient::scan(NPOT, c, a);
//printArray(NPOT, c, true);
printCmpResult(NPOT, b, c);

zeroArray(SIZE, c);
printDesc("thrust scan, power-of-two");
StreamCompaction::Thrust::scan(SIZE, c, a);
Expand All @@ -82,13 +86,11 @@ int main(int argc, char* argv[]) {
printf("*****************************\n");

// Compaction tests

genArray(SIZE - 1, a, 4); // Leave a 0 at the end to test that edge case
a[SIZE - 1] = 0;
printArray(SIZE, a, true);

int count, expectedCount, expectedNPOT;

zeroArray(SIZE, b);
printDesc("cpu compact without scan, power-of-two");
count = StreamCompaction::CPU::compactWithoutScan(SIZE, b, a);
Expand Down Expand Up @@ -120,4 +122,59 @@ int main(int argc, char* argv[]) {
count = StreamCompaction::Efficient::compact(NPOT, c, a);
//printArray(count, c, true);
printCmpLenResult(count, expectedNPOT, b, c);

printf("\n");
printf("*****************************\n");
printf("** RADIX SORT (Single block/tile) TEST **\n");
printf("*****************************\n");

zeroArray(SIZE, c);
count = RadixSort::sort(SIZE, c, a, 8);

std::vector<int> s(a, a + SIZE);
std::sort(s.begin(), s.end());

int d[SIZE];
for (int i = 0; i < SIZE; i++){
d[i] = s[i];
}

printf("Radix sort:\n");
printArray(count, c, true);
printf("Std sort:\n");
printArray(count, d, true);
printCmpLenResult(count, count, d, c);

printf("\n");
printf("*****************************\n");
printf("** SCAN PERFORMANCE **\n");
printf("*****************************\n");

for (int s = 4; s < 19; s++){
int ssize = 1 << s;
int *u, *v;
u = (int *)malloc(ssize*sizeof(int));
v = (int *)malloc(ssize*sizeof(int));
printf("==== Array size: %d ====\n", ssize);
genArray(ssize, u, 50);

zeroArray(ssize, v);
auto start = std::chrono::high_resolution_clock::now();
StreamCompaction::CPU::scan(ssize, v, u);
auto end = std::chrono::high_resolution_clock::now();
double diff = std::chrono::duration_cast<std::chrono::milliseconds>(end-start).count();
printf("CPU scan: %f\n", diff);

zeroArray(ssize, v);
StreamCompaction::Naive::scan(ssize, v, u);

zeroArray(ssize, v);
StreamCompaction::Efficient::scan(ssize, v, u);

zeroArray(ssize, v);
StreamCompaction::Thrust::scan(ssize, v, u);
free(u);
free(v);
printf("\n");
}
}
2 changes: 2 additions & 0 deletions stream_compaction/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@ set(SOURCE_FILES
"efficient.cu"
"thrust.h"
"thrust.cu"
"radix.h"
"radix.cu"
)

cuda_add_library(stream_compaction
Expand Down
51 changes: 45 additions & 6 deletions stream_compaction/cpu.cu
Original file line number Diff line number Diff line change
@@ -1,15 +1,19 @@
#include <cstdio>
#include "cpu.h"
#include <stdlib.h>

namespace StreamCompaction {
namespace CPU {

/**
* CPU scan (prefix sum).
* CPU scan (exclusive prefix sum).
*/
void scan(int n, int *odata, const int *idata) {
// TODO
printf("TODO\n");
odata[0] = 0;
for (int i = 1; i < n; i++){
odata[i] = odata[i - 1] + idata[i - 1];
}
}

/**
Expand All @@ -18,8 +22,16 @@ void scan(int n, int *odata, const int *idata) {
* @returns the number of elements remaining after compaction.
*/
int compactWithoutScan(int n, int *odata, const int *idata) {
// TODO
return -1;
int count = 0;
int j = 0;
for (int i = 0; i < n; i++){
if (idata[i] != 0){
count++;
odata[j] = idata[i];
j++;
}
}
return count;
}

/**
Expand All @@ -28,8 +40,35 @@ int compactWithoutScan(int n, int *odata, const int *idata) {
* @returns the number of elements remaining after compaction.
*/
int compactWithScan(int n, int *odata, const int *idata) {
// TODO
return -1;
int *c;
c = (int *)malloc(n * sizeof(int));
for (int i = 0; i < n; i++){
if (idata[i] != 0){
c[i] = 1;
}
else {
c[i] = 0;
}
}
int *d, *e;
d = (int *)malloc(n * sizeof(int));

scan(n, d, c);

for (int i = 0; i < n; i++){
if (c[i] == 1){
odata[d[i]] = idata[i];
}
}
free(c);
free(d);
int count = 0;
for (int i = 0; i < n; i++){
if (odata[i] != 0){
count++;
}
}
return count;
}

}
Expand Down
124 changes: 119 additions & 5 deletions stream_compaction/efficient.cu
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,93 @@
namespace StreamCompaction {
namespace Efficient {

// TODO: __global__
__global__ void scanUp(int d, int *idata){
int k = blockIdx.x*blockDim.x + threadIdx.x;
if (k % (int)pow((double)2, (double)(d + 1)) == 0){
idata[k - 1 + (int)pow((double)2, (double)(d + 1))] += idata[k - 1 + (int)pow((double)2, (double)d)];
}
}

__global__ void scanDown(int d, int *idata){
int k = blockIdx.x*blockDim.x + threadIdx.x;
if (k % (int)pow((double)2, (double)(d + 1)) == 0){
int t = idata[k - 1 + (int)pow((double)2, (double)d)];
idata[k - 1 + (int)pow((double)2, (double)d)] = idata[k - 1 + (int)pow((double)2, (double)(d + 1))];
idata[k - 1 + (int)pow((double)2, (double)(d + 1))] += t;
}
}

__global__ void filter(int *odata, int *idata){
int k = blockIdx.x*blockDim.x + threadIdx.x;
if (idata[k] == 0){
odata[k] = 0;
}
else {
odata[k] = 1;
}
}

__global__ void scatter(int *odata, int *idata, int *filter, int *idx){
int k = blockIdx.x*blockDim.x + threadIdx.x;
if (filter[k] == 1){
odata[idx[k]] = idata[k];
}
}

/**
* Performs prefix-sum (aka scan) on idata, storing the result into odata.
*/
void scan(int n, int *odata, const int *idata) {
// TODO
printf("TODO\n");
// Padding
int m = (int)pow((double)2, (double)ilog2ceil(n));
int *pidata;
pidata = (int*)malloc(m*sizeof(int));
for (int i = 0; i < n; i++){
pidata[i] = idata[i];
}
if (m > n){
for (int i = n; i < m; i++){
pidata[i] = 0;
}
}
int *dev_pidata;
cudaMalloc((void **)&dev_pidata, m*sizeof(int));
cudaMemcpy(dev_pidata, pidata, m*sizeof(int), cudaMemcpyHostToDevice);

cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);

// Scan
cudaEventRecord(start);
for (int d = 0; d < ilog2ceil(m); d++){
scanUp << <m/128, 128>> >(d, dev_pidata);
}
cudaEventRecord(stop);
cudaEventSynchronize(stop);
float ms1 = 0;
cudaEventElapsedTime(&ms1, start, stop);

cudaMemcpy(pidata, dev_pidata, m*sizeof(int), cudaMemcpyDeviceToHost);
pidata[m - 1] = 0;
cudaMemcpy(dev_pidata, pidata, m*sizeof(int), cudaMemcpyHostToDevice);

cudaEventRecord(start);
for (int d = ilog2ceil(m)-1; d >=0; d--){
scanDown<<<m/128, 128>>>(d, dev_pidata);
}
cudaEventRecord(stop);
cudaEventSynchronize(stop);
float ms2 = 0;
cudaEventElapsedTime(&ms2, start, stop);
printf("Work-efficient scan: %f\n", (ms1+ms2));

cudaMemcpy(pidata, dev_pidata, m*sizeof(int), cudaMemcpyDeviceToHost);
for (int i = 0; i < n; i++){
odata[i] = pidata[i];
}
cudaFree(dev_pidata);
free(pidata);
}

/**
Expand All @@ -26,8 +105,43 @@ void scan(int n, int *odata, const int *idata) {
* @returns The number of elements remaining after compaction.
*/
int compact(int n, int *odata, const int *idata) {
// TODO
return -1;
int *f;
int *dev_idata;
cudaMalloc((void**)&f, n * sizeof(int));
cudaMalloc((void**)&dev_idata, n * sizeof(int));
cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice);
filter << <1, n >> >(f, dev_idata);

int *hs_idx;
int *hs_f;
hs_idx = (int *)malloc(n * sizeof(int));
hs_f = (int *)malloc(n*sizeof(int));
cudaMemcpy(hs_f, f, n * sizeof(int), cudaMemcpyDeviceToHost);
scan(n, hs_idx, hs_f);

int *idx;
int *dv_out;
cudaMalloc((void**)&idx, n * sizeof(int));
cudaMalloc((void**)&dv_out, n * sizeof(int));
cudaMemcpy(idx, hs_idx, n * sizeof(int), cudaMemcpyHostToDevice);
scatter << <1, n >> >(dv_out, dev_idata, f, idx);

cudaMemcpy(odata, dv_out, n * sizeof(int), cudaMemcpyDeviceToHost);

cudaFree(f);
cudaFree(dev_idata);
cudaFree(idx);
cudaFree(dv_out);
free(hs_idx);
free(hs_f);

int count = 0;
for (int i = 0; i < n; i++){
if (odata[i] != 0){
count++;
}
}
return count;
}

}
Expand Down
49 changes: 46 additions & 3 deletions stream_compaction/naive.cu
Original file line number Diff line number Diff line change
Expand Up @@ -2,18 +2,61 @@
#include <cuda_runtime.h>
#include "common.h"
#include "naive.h"
#include <stdlib.h>

namespace StreamCompaction {
namespace Naive {

// TODO: __global__
__global__ void scanCol(int d, int *idata){
int k = blockIdx.x*blockDim.x + threadIdx.x;
if (k >= (int)pow((double)2, (double)(d-1))){
idata[k] = idata[k - (int)pow((double)2, (double)(d - 1))] + idata[k];
}
}

/**
* Performs prefix-sum (aka scan) on idata, storing the result into odata.
*/
void scan(int n, int *odata, const int *idata) {
// TODO
printf("TODO\n");
// Padding
int m = (int)pow((double)2, (double)ilog2ceil(n));
int *pidata;
pidata = (int*)malloc(m*sizeof(int));
for (int i = 0; i < n; i++){
pidata[i] = idata[i];
}
if (m > n){
for (int i = n; i < m; i++){
pidata[i] = 0;
}
}
int *dev_pidata;
cudaMalloc((void **)&dev_pidata, m*sizeof(int));
cudaMemcpy(dev_pidata, pidata, m*sizeof(int), cudaMemcpyHostToDevice);

cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);

// Scan
cudaEventRecord(start);
for (int d = 1; d <= ilog2ceil(m); d++){
scanCol<<<m/64, 64>>>(d, dev_pidata);
}
cudaEventRecord(stop);
cudaEventSynchronize(stop);

float msAdd = 0;
cudaEventElapsedTime(&msAdd, start, stop);
printf("Naive scan: %f\n", msAdd);

cudaMemcpy(pidata, dev_pidata, m*sizeof(int), cudaMemcpyDeviceToHost);
odata[0] = 0;
for (int i = 1; i < n; i++){
odata[i] = pidata[i-1];
}
cudaFree(dev_pidata);
free(pidata);
}

}
Expand Down
Loading