diff --git a/README.md b/README.md index a82ea0f..87a6183 100644 --- a/README.md +++ b/README.md @@ -3,38 +3,16 @@ CUDA Stream Compaction **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2** -* (TODO) YOUR NAME HERE -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +* Sally Kong +* Tested on: Windows 8, i7-5500U CPU @ 2.40GHz 2.40 GHz, GEForce 920M (Personal) -### (TODO: Your README) -Include analysis, etc. (Remember, this is public, so don't put -anything here that you don't want to share with the world.) +**Summary:** This project is an implementation of a GPU stream compaction in CUDA, +from scratch. This is a widely used algorithm that I later plan to use to accelerate my path tracer. -Instructions (delete me) -======================== - -This is due Sunday, September 13 at midnight. - -**Summary:** In this project, you'll implement GPU stream compaction in CUDA, -from scratch. This algorithm is widely used, and will be important for -accelerating your path tracer project. - -Your stream compaction implementations in this project will simply remove `0`s -from an array of `int`s. In the path tracer, you will remove terminated paths -from an array of rays. - -In addition to being useful for your path tracer, this project is meant to -reorient your algorithmic thinking to the way of the GPU. On GPUs, many -algorithms can benefit from massive parallelism and, in particular, data -parallelism: executing the same code many times simultaneously with different -data. - -You'll implement a few different versions of the *Scan* (*Prefix Sum*) -algorithm. First, you'll implement a CPU version of the algorithm to reinforce -your understanding. Then, you'll write a few GPU implementations: "naive" and -"work-efficient." Finally, you'll use some of these to implement GPU stream -compaction. +A few different versions of the *Scan* (*Prefix Sum*) +algorithm were implemented: a CPU version, and a few GPU implementations: "naive" and +"work-efficient." **Algorithm overview & details:** There are two primary references for details on the implementation of scan and stream compaction. @@ -43,171 +21,7 @@ on the implementation of scan and stream compaction. for Scan, Stream Compaction, and Work-Efficient Parallel Scan. * GPU Gems 3, Chapter 39 - [Parallel Prefix Sum (Scan) with CUDA](http://http.developer.nvidia.com/GPUGems3/gpugems3_ch39.html). -Your GPU stream compaction implementation will live inside of the -`stream_compaction` subproject. This way, you will be able to easily copy it -over for use in your GPU path tracer. - - -## Part 0: The Usual - -This project (and all other CUDA projects in this course) requires an NVIDIA -graphics card with CUDA capability. Any card with Compute Capability 2.0 -(`sm_20`) or greater will work. Check your GPU on this -[compatibility table](https://developer.nvidia.com/cuda-gpus). -If you do not have a personal machine with these specs, you may use those -computers in the Moore 100B/C which have supported GPUs. - -**HOWEVER**: If you need to use the lab computer for your development, you will -not presently be able to do GPU performance profiling. This will be very -important for debugging performance bottlenecks in your program. - -### Useful existing code - -* `stream_compaction/common.h` - * `checkCUDAError` macro: checks for CUDA errors and exits if there were any. - * `ilog2ceil(x)`: computes the ceiling of log2(x), as an integer. -* `main.cpp` - * Some testing code for your implementations. - - -## Part 1: CPU Scan & Stream Compaction - -This stream compaction method will remove `0`s from an array of `int`s. - -In `stream_compaction/cpu.cu`, implement: - -* `StreamCompaction::CPU::scan`: compute an exclusive prefix sum. -* `StreamCompaction::CPU::compactWithoutScan`: stream compaction without using - the `scan` function. -* `StreamCompaction::CPU::compactWithScan`: stream compaction using the `scan` - function. Map the input array to an array of 0s and 1s, scan it, and use - scatter to produce the output. You will need a **CPU** scatter implementation - for this (see slides or GPU Gems chapter for an explanation). - -These implementations should only be a few lines long. - - -## Part 2: Naive GPU Scan Algorithm - -In `stream_compaction/naive.cu`, implement `StreamCompaction::Naive::scan` - -This uses the "Naive" algorithm from GPU Gems 3, Section 39.2.1. We haven't yet -taught shared memory, and you **shouldn't use it yet**. Example 39-1 uses -shared memory, but is limited to operating on very small arrays! Instead, write -this using global memory only. As a result of this, you will have to do -`ilog2ceil(n)` separate kernel invocations. - -Beware of errors in Example 39-1 in the book; both the pseudocode and the CUDA -code in the online version of Chapter 39 are known to have a few small errors -(in superscripting, missing braces, bad indentation, etc.) - -Since the parallel scan algorithm operates on a binary tree structure, it works -best with arrays with power-of-two length. Make sure your implementation works -on non-power-of-two sized arrays (see `ilog2ceil`). This requires extra memory -- your intermediate array sizes will need to be rounded to the next power of -two. - - -## Part 3: Work-Efficient GPU Scan & Stream Compaction - -### 3.1. Scan - -In `stream_compaction/efficient.cu`, implement -`StreamCompaction::Efficient::scan` - -All of the text in Part 2 applies. - -* This uses the "Work-Efficient" algorithm from GPU Gems 3, Section 39.2.2. -* Beware of errors in Example 39-2. -* Test non-power-of-two sized arrays. - -### 3.2. Stream Compaction - -This stream compaction method will remove `0`s from an array of `int`s. - -In `stream_compaction/efficient.cu`, implement -`StreamCompaction::Efficient::compact` - -For compaction, you will also need to implement the scatter algorithm presented -in the slides and the GPU Gems chapter. - -In `stream_compaction/common.cu`, implement these for use in `compact`: - -* `StreamCompaction::Common::kernMapToBoolean` -* `StreamCompaction::Common::kernScatter` - - -## Part 4: Using Thrust's Implementation - -In `stream_compaction/thrust.cu`, implement: - -* `StreamCompaction::Thrust::scan` - -This should be a very short function which wraps a call to the Thrust library -function `thrust::exclusive_scan(first, last, result)`. - -To measure timing, be sure to exclude memory operations by passing -`exclusive_scan` a `thrust::device_vector` (which is already allocated on the -GPU). You can create a `thrust::device_vector` by creating a -`thrust::host_vector` from the given pointer, then casting it. - - -## Part 5: Radix Sort (Extra Credit) (+10) - -Add an additional module to the `stream_compaction` subproject. Implement radix -sort using one of your scan implementations. Add tests to check its correctness. - - -## Write-up - -1. Update all of the TODOs at the top of this README. -2. Add a description of this project including a list of its features. -3. Add your performance analysis (see below). - -All extra credit features must be documented in your README, explaining its -value (with performance comparison, if applicable!) and showing an example how -it works. For radix sort, show how it is called and an example of its output. - -Always profile with Release mode builds and run without debugging. - -### Questions - -* Roughly optimize the block sizes of each of your implementations for minimal - run time on your GPU. - * (You shouldn't compare unoptimized implementations to each other!) - -* Compare all of these GPU Scan implementations (Naive, Work-Efficient, and - Thrust) to the serial CPU version of Scan. Plot a graph of the comparison - (with array size on the independent axis). - * You should use CUDA events for timing. Be sure **not** to include any - explicit memory operations in your performance measurements, for - comparability. - * To guess at what might be happening inside the Thrust implementation, take - a look at the Nsight timeline for its execution. - -* Write a brief explanation of the phenomena you see here. - * Can you find the performance bottlenecks? Is it memory I/O? Computation? Is - it different for each implementation? - -* Paste the output of the test program into a triple-backtick block in your - README. - * If you add your own tests (e.g. for radix sort or to test additional corner - cases), be sure to mention it explicitly. - -These questions should help guide you in performance analysis on future -assignments, as well. - -## Submit +## Performance Analysis -If you have modified any of the `CMakeLists.txt` files at all (aside from the -list of `SOURCE_FILES`), you must test that your project can build in Moore -100B/C. Beware of any build issues discussed on the Google Group. +![](imgs/graph.png) -1. Open a GitHub pull request so that we can see that you have finished. - The title should be "Submission: YOUR NAME". -2. Send an email to the TA (gmail: kainino1+cis565@) with: - * **Subject**: in the form of `[CIS565] Project 2: PENNKEY` - * Direct link to your pull request on GitHub - * In the form of a grade (0-100+) with comments, evaluate your own - performance on the project. - * Feedback on the project itself, if any. diff --git a/imgs/graph.png b/imgs/graph.png new file mode 100644 index 0000000..b095e47 Binary files /dev/null and b/imgs/graph.png differ diff --git a/src/main.cpp b/src/main.cpp index 675da35..3d70dd8 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -38,42 +38,43 @@ int main(int argc, char* argv[]) { printDesc("cpu scan, non-power-of-two"); StreamCompaction::CPU::scan(NPOT, c, a); printArray(NPOT, b, true); + printArray(NPOT, c, true); printCmpResult(NPOT, b, c); zeroArray(SIZE, c); - printDesc("naive scan, power-of-two"); + printDesc("naive scan, power-of-two"); StreamCompaction::Naive::scan(SIZE, c, a); - //printArray(SIZE, c, true); + printArray(SIZE, c, true); printCmpResult(SIZE, b, c); zeroArray(SIZE, c); printDesc("naive scan, non-power-of-two"); StreamCompaction::Naive::scan(NPOT, c, a); - //printArray(SIZE, c, true); + printArray(SIZE, c, true); printCmpResult(NPOT, b, c); zeroArray(SIZE, c); printDesc("work-efficient scan, power-of-two"); StreamCompaction::Efficient::scan(SIZE, c, a); - //printArray(SIZE, c, true); + printArray(SIZE, c, true); printCmpResult(SIZE, b, c); zeroArray(SIZE, c); printDesc("work-efficient scan, non-power-of-two"); StreamCompaction::Efficient::scan(NPOT, c, a); - //printArray(NPOT, c, true); + printArray(NPOT, c, true); printCmpResult(NPOT, b, c); zeroArray(SIZE, c); printDesc("thrust scan, power-of-two"); StreamCompaction::Thrust::scan(SIZE, c, a); - //printArray(SIZE, c, true); + printArray(SIZE, c, true); printCmpResult(SIZE, b, c); zeroArray(SIZE, c); printDesc("thrust scan, non-power-of-two"); StreamCompaction::Thrust::scan(NPOT, c, a); - //printArray(NPOT, c, true); + printArray(NPOT, c, true); printCmpResult(NPOT, b, c); printf("\n"); @@ -112,12 +113,13 @@ int main(int argc, char* argv[]) { zeroArray(SIZE, c); printDesc("work-efficient compact, power-of-two"); count = StreamCompaction::Efficient::compact(SIZE, c, a); - //printArray(count, c, true); + printArray(count, c, true); printCmpLenResult(count, expectedCount, b, c); zeroArray(SIZE, c); printDesc("work-efficient compact, non-power-of-two"); count = StreamCompaction::Efficient::compact(NPOT, c, a); - //printArray(count, c, true); + printArray(count, c, true); printCmpLenResult(count, expectedNPOT, b, c); + system("pause"); } diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu index fe872d4..1012fcb 100644 --- a/stream_compaction/common.cu +++ b/stream_compaction/common.cu @@ -23,7 +23,14 @@ namespace Common { * which map to 0 will be removed, and elements which map to 1 will be kept. */ __global__ void kernMapToBoolean(int n, int *bools, const int *idata) { - // TODO + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index < n) { + if (idata[index] != 0) { + bools[index] = 1; + } else { + bools[index] = 0; + } + } } /** @@ -32,7 +39,12 @@ __global__ void kernMapToBoolean(int n, int *bools, const int *idata) { */ __global__ void kernScatter(int n, int *odata, const int *idata, const int *bools, const int *indices) { - // TODO + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index < n) { + if (bools[index] ==1) { + odata[indices[index]] = idata[index]; + } + } } } diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu index e600c29..ac5d74f 100644 --- a/stream_compaction/cpu.cu +++ b/stream_compaction/cpu.cu @@ -8,8 +8,10 @@ namespace CPU { * CPU scan (prefix sum). */ void scan(int n, int *odata, const int *idata) { - // TODO - printf("TODO\n"); + odata[0] = 0; + for (int i = 1; i < n; i++) { + odata[i] = idata[i-1] + odata[i-1]; + } } /** @@ -18,8 +20,16 @@ void scan(int n, int *odata, const int *idata) { * @returns the number of elements remaining after compaction. */ int compactWithoutScan(int n, int *odata, const int *idata) { - // TODO - return -1; + int cnt = 0; + for(int i = 0; i < n; i++) { + if (idata[i] != 0) { + cnt++; + odata[i] = 1; + } else { + odata[i] = 0; + } + } + return cnt; } /** @@ -28,8 +38,16 @@ int compactWithoutScan(int n, int *odata, const int *idata) { * @returns the number of elements remaining after compaction. */ int compactWithScan(int n, int *odata, const int *idata) { - // TODO - return -1; + for(int i = 0; i < n; i++) { + if (idata[i] != 0) { + odata[i] = 1; + } else { + odata[i] = 0; + } + } + int* result = new int[n]; + scan(n, result, odata); + return result[n-1]; } } diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index b2f739b..68d21bd 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -6,16 +6,77 @@ namespace StreamCompaction { namespace Efficient { -// TODO: __global__ +#define blockSize 1024 +int *temp_scan; +int *scan_result; + +__global__ void upSweep(int n, int d, int *o_data, int *i_data) { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index <= n) { + if (index % (int)pow(2.0, d+1) == 0) { + o_data[index-1] = i_data[index - 1 - (int)pow(2.0, d)] + i_data[index - 1]; + } + } +} + +__global__ void downSweep(int n, int d, int *o_data, int *i_data) { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + int temp = 0; + if (index <= n) { + if (index % (int)pow(2.0, d+1) == 0) { + temp = i_data[index - 1 - (int)pow(2.0, d)]; + o_data[index - 1 - (int)pow(2.0, d)] = i_data[index-1]; + o_data[index-1] = temp + i_data[index - 1]; + } + } + +} -/** - * Performs prefix-sum (aka scan) on idata, storing the result into odata. - */ void scan(int n, int *odata, const int *idata) { - // TODO - printf("TODO\n"); + int d = ilog2ceil(n); + int total = (int) pow(2.0, d); + + cudaMalloc((void**)&scan_result, total * sizeof(int)); + cudaMalloc((void**)&temp_scan, total * sizeof(int)); + + cudaMemcpy(temp_scan, idata, total * sizeof(int), cudaMemcpyHostToDevice); + cudaMemcpy(scan_result, idata, total * sizeof(int), cudaMemcpyHostToDevice); + + dim3 fullBlocksPerGrid((total + blockSize - 1) / blockSize); + + cudaEvent_t start, stop; + cudaEventCreate(&start); + cudaEventCreate(&stop); + + cudaEventRecord(start); + for (int i = 0; i < d; i++) { + upSweep<<>>(total, i, scan_result, temp_scan); + temp_scan = scan_result; + } + + + cudaMemcpy(odata, scan_result, total * sizeof(int), cudaMemcpyDeviceToHost); + odata[total-1] = 0; + + cudaMemcpy(scan_result, odata, total * sizeof(int), cudaMemcpyHostToDevice); + cudaMemcpy(temp_scan, odata, total * sizeof(int), cudaMemcpyHostToDevice); + + for (int i = d-1; i >= 0; i--) { + downSweep<<>>(total, i, scan_result, temp_scan); + temp_scan = scan_result; + } + + cudaEventRecord(stop); + cudaEventSynchronize(stop); + float milliseconds = 0; + cudaEventElapsedTime(&milliseconds, start, stop); + printf("%f milliseconds for efficient \n", milliseconds); + + cudaMemcpy(odata, scan_result, total * sizeof(int), cudaMemcpyDeviceToHost); + printf("odata[n-1] %d \n", odata[total-1]); } + /** * Performs stream compaction on idata, storing the result into odata. * All zeroes are discarded. @@ -26,8 +87,32 @@ void scan(int n, int *odata, const int *idata) { * @returns The number of elements remaining after compaction. */ int compact(int n, int *odata, const int *idata) { - // TODO - return -1; + + int d = ilog2ceil(n); + int total = (int) pow(2.0, d); + + int *predicate_array; + int *hst_predicate_array; + int *dev_idata; + int *compact_array; + + cudaMalloc((void**)&predicate_array, total * sizeof(int)); + cudaMalloc((void**)&hst_predicate_array, total * sizeof(int)); + cudaMalloc((void**)&dev_idata, total * sizeof(int)); + cudaMalloc((void**)&compact_array, total * sizeof(int)); + + cudaMemcpy(dev_idata, idata, total * sizeof(int), cudaMemcpyHostToDevice); + + dim3 fullBlocksPerGrid((total + blockSize - 1) / blockSize); + + Common::kernMapToBoolean<<>>(total, predicate_array, dev_idata); + cudaMemcpy(hst_predicate_array, predicate_array, total * sizeof(int), cudaMemcpyDeviceToHost); + + scan(total, odata, hst_predicate_array); + int totalAfterCompaction = odata[total-1]; + cudaMemcpy(odata, predicate_array, total * sizeof(int), cudaMemcpyDeviceToHost); + + return totalAfterCompaction; } } diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index 3d86b60..535007f 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -1,3 +1,5 @@ +#define GLM_FORCE_CUDA +#include #include #include #include "common.h" @@ -6,14 +8,72 @@ namespace StreamCompaction { namespace Naive { +#define blockSize 128 +int *scan_result; +int *temp_scan; +int *shifted_result; + // TODO: __global__ +__global__ void prefixSum(int n, int d, int *o_data, int *i_data) { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + + if (index < n) { + if (index >= (int)pow(2.0, d-1)) { + o_data[index] = i_data[index - (int)pow(2.0, d-1)] + i_data[index]; + } + } +} + + + /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { - // TODO - printf("TODO\n"); + + int d = ilog2ceil(n); + + cudaMalloc((void**)&scan_result, n * sizeof(int)); + cudaMalloc((void**)&temp_scan, n * sizeof(int)); + cudaMalloc((void**)&shifted_result, n * sizeof(int)); + + cudaMemcpy(temp_scan, idata, n * sizeof(int), cudaMemcpyHostToDevice); + cudaMemcpy(scan_result, idata, n * sizeof(int), cudaMemcpyHostToDevice); + + dim3 fullBlocksPerGrid((n + blockSize - 1) / blockSize); + + cudaEvent_t start, stop; + cudaEventCreate(&start); + cudaEventCreate(&stop); + + cudaEventRecord(start); + for (int i = 1; i <= d; i++) { + prefixSum<<>>(n, i, scan_result, temp_scan); + temp_scan = scan_result; + } + cudaEventRecord(stop); + + cudaMemcpy(odata, scan_result, n * sizeof(int), cudaMemcpyDeviceToHost); + + //shift right + for(int i = n-1; i >= 0; i--) { + odata[i] = odata[i-1]; + } + odata[0] = 0; + + cudaEventSynchronize(stop); + float milliseconds = 0; + cudaEventElapsedTime(&milliseconds, start, stop); + printf("%f milliseconds for naive \n", milliseconds); + + cleanUp(); + +} + +void cleanUp() { + cudaFree(scan_result); + cudaFree(temp_scan); } } diff --git a/stream_compaction/naive.h b/stream_compaction/naive.h index 21152d6..1e28232 100644 --- a/stream_compaction/naive.h +++ b/stream_compaction/naive.h @@ -3,5 +3,6 @@ namespace StreamCompaction { namespace Naive { void scan(int n, int *odata, const int *idata); + void cleanUp(); } } diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu index d8dbb32..c23549d 100644 --- a/stream_compaction/thrust.cu +++ b/stream_compaction/thrust.cu @@ -15,7 +15,25 @@ namespace Thrust { void scan(int n, int *odata, const int *idata) { // TODO use `thrust::exclusive_scan` // example: for device_vectors dv_in and dv_out: - // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); + + cudaEvent_t start, stop; + cudaEventCreate(&start); + cudaEventCreate(&stop); + + thrust::host_vector hst_in(idata, idata + n); + thrust::device_vector dv_in(hst_in); + thrust::device_vector dv_out(n); + + cudaEventRecord(start); + thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); + cudaEventRecord(stop); + + cudaEventSynchronize(stop); + float milliseconds = 0; + cudaEventElapsedTime(&milliseconds, start, stop); + printf("%f milliseconds for thrust \n", milliseconds); + + thrust::copy(dv_out.begin(), dv_out.end(), odata); } }