diff --git a/README.md b/README.md
index a82ea0f..da64423 100644
--- a/README.md
+++ b/README.md
@@ -3,211 +3,209 @@ CUDA Stream Compaction
 **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2**
 
-* (TODO) YOUR NAME HERE
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
-
-### (TODO: Your README)
-
-Include analysis, etc. (Remember, this is public, so don't put
-anything here that you don't want to share with the world.)
-
-Instructions (delete me)
-========================
-
-This is due Sunday, September 13 at midnight.
-
-**Summary:** In this project, you'll implement GPU stream compaction in CUDA,
-from scratch. This algorithm is widely used, and will be important for
-accelerating your path tracer project.
-
-Your stream compaction implementations in this project will simply remove `0`s
-from an array of `int`s. In the path tracer, you will remove terminated paths
-from an array of rays.
-
-In addition to being useful for your path tracer, this project is meant to
-reorient your algorithmic thinking to the way of the GPU. On GPUs, many
-algorithms can benefit from massive parallelism and, in particular, data
-parallelism: executing the same code many times simultaneously with different
-data.
-
-You'll implement a few different versions of the *Scan* (*Prefix Sum*)
-algorithm. First, you'll implement a CPU version of the algorithm to reinforce
-your understanding. Then, you'll write a few GPU implementations: "naive" and
-"work-efficient." Finally, you'll use some of these to implement GPU stream
-compaction.
-
-**Algorithm overview & details:** There are two primary references for details
-on the implementation of scan and stream compaction.
-
-* The [slides on Parallel Algorithms](https://github.com/CIS565-Fall-2015/cis565-fall-2015.github.io/raw/master/lectures/2-Parallel-Algorithms.pptx)
-  for Scan, Stream Compaction, and Work-Efficient Parallel Scan.
-* GPU Gems 3, Chapter 39 - [Parallel Prefix Sum (Scan) with CUDA](http://http.developer.nvidia.com/GPUGems3/gpugems3_ch39.html).
-
-Your GPU stream compaction implementation will live inside of the
-`stream_compaction` subproject. This way, you will be able to easily copy it
-over for use in your GPU path tracer.
-
-
-## Part 0: The Usual
-
-This project (and all other CUDA projects in this course) requires an NVIDIA
-graphics card with CUDA capability. Any card with Compute Capability 2.0
-(`sm_20`) or greater will work. Check your GPU on this
-[compatibility table](https://developer.nvidia.com/cuda-gpus).
-If you do not have a personal machine with these specs, you may use those
-computers in the Moore 100B/C which have supported GPUs.
-
-**HOWEVER**: If you need to use the lab computer for your development, you will
-not presently be able to do GPU performance profiling. This will be very
-important for debugging performance bottlenecks in your program.
-
-### Useful existing code
-
-* `stream_compaction/common.h`
-  * `checkCUDAError` macro: checks for CUDA errors and exits if there were any.
-  * `ilog2ceil(x)`: computes the ceiling of log2(x), as an integer.
-* `main.cpp`
-  * Some testing code for your implementations.
-
-
-## Part 1: CPU Scan & Stream Compaction
-
-This stream compaction method will remove `0`s from an array of `int`s.
-
-In `stream_compaction/cpu.cu`, implement:
-
-* `StreamCompaction::CPU::scan`: compute an exclusive prefix sum.
-* `StreamCompaction::CPU::compactWithoutScan`: stream compaction without using
-  the `scan` function.
-* `StreamCompaction::CPU::compactWithScan`: stream compaction using the `scan`
-  function. Map the input array to an array of 0s and 1s, scan it, and use
-  scatter to produce the output. You will need a **CPU** scatter implementation
-  for this (see slides or GPU Gems chapter for an explanation).
-
-These implementations should only be a few lines long.
-
-
-## Part 2: Naive GPU Scan Algorithm
-
-In `stream_compaction/naive.cu`, implement `StreamCompaction::Naive::scan`
-
-This uses the "Naive" algorithm from GPU Gems 3, Section 39.2.1. We haven't yet
-taught shared memory, and you **shouldn't use it yet**. Example 39-1 uses
-shared memory, but is limited to operating on very small arrays! Instead, write
-this using global memory only. As a result of this, you will have to do
-`ilog2ceil(n)` separate kernel invocations.
-
-Beware of errors in Example 39-1 in the book; both the pseudocode and the CUDA
-code in the online version of Chapter 39 are known to have a few small errors
-(in superscripting, missing braces, bad indentation, etc.)
-
-Since the parallel scan algorithm operates on a binary tree structure, it works
-best with arrays with power-of-two length. Make sure your implementation works
-on non-power-of-two sized arrays (see `ilog2ceil`). This requires extra memory
--- your intermediate array sizes will need to be rounded to the next power of
-two.
-
-
-## Part 3: Work-Efficient GPU Scan & Stream Compaction
-
-### 3.1. Scan
-
-In `stream_compaction/efficient.cu`, implement
-`StreamCompaction::Efficient::scan`
-
-All of the text in Part 2 applies.
-
-* This uses the "Work-Efficient" algorithm from GPU Gems 3, Section 39.2.2.
-* Beware of errors in Example 39-2.
-* Test non-power-of-two sized arrays.
-
-### 3.2. Stream Compaction
-
-This stream compaction method will remove `0`s from an array of `int`s.
-
-In `stream_compaction/efficient.cu`, implement
-`StreamCompaction::Efficient::compact`
-
-For compaction, you will also need to implement the scatter algorithm presented
-in the slides and the GPU Gems chapter.
-
-In `stream_compaction/common.cu`, implement these for use in `compact`:
-
-* `StreamCompaction::Common::kernMapToBoolean`
-* `StreamCompaction::Common::kernScatter`
-
-
-## Part 4: Using Thrust's Implementation
-
-In `stream_compaction/thrust.cu`, implement:
-
-* `StreamCompaction::Thrust::scan`
-
-This should be a very short function which wraps a call to the Thrust library
-function `thrust::exclusive_scan(first, last, result)`.
-
-To measure timing, be sure to exclude memory operations by passing
-`exclusive_scan` a `thrust::device_vector` (which is already allocated on the
-GPU). You can create a `thrust::device_vector` by creating a
-`thrust::host_vector` from the given pointer, then casting it.
-
-
-## Part 5: Radix Sort (Extra Credit) (+10)
-
-Add an additional module to the `stream_compaction` subproject. Implement radix
-sort using one of your scan implementations. Add tests to check its correctness.
-
-
-## Write-up
-
-1. Update all of the TODOs at the top of this README.
-2. Add a description of this project including a list of its features.
-3. Add your performance analysis (see below).
-
-All extra credit features must be documented in your README, explaining its
-value (with performance comparison, if applicable!) and showing an example how
-it works. For radix sort, show how it is called and an example of its output.
-
-Always profile with Release mode builds and run without debugging.
-
-### Questions
-
-* Roughly optimize the block sizes of each of your implementations for minimal
-  run time on your GPU.
-  * (You shouldn't compare unoptimized implementations to each other!)
-
-* Compare all of these GPU Scan implementations (Naive, Work-Efficient, and
-  Thrust) to the serial CPU version of Scan. Plot a graph of the comparison
-  (with array size on the independent axis).
-  * You should use CUDA events for timing. Be sure **not** to include any
-    explicit memory operations in your performance measurements, for
-    comparability.
-  * To guess at what might be happening inside the Thrust implementation, take
-    a look at the Nsight timeline for its execution.
-
-* Write a brief explanation of the phenomena you see here.
-  * Can you find the performance bottlenecks? Is it memory I/O? Computation? Is
-    it different for each implementation?
-
-* Paste the output of the test program into a triple-backtick block in your
-  README.
-  * If you add your own tests (e.g. for radix sort or to test additional corner
-    cases), be sure to mention it explicitly.
-
-These questions should help guide you in performance analysis on future
-assignments, as well.
-
-## Submit
-
-If you have modified any of the `CMakeLists.txt` files at all (aside from the
-list of `SOURCE_FILES`), you must test that your project can build in Moore
-100B/C. Beware of any build issues discussed on the Google Group.
-
-1. Open a GitHub pull request so that we can see that you have finished.
-   The title should be "Submission: YOUR NAME".
-2. Send an email to the TA (gmail: kainino1+cis565@) with:
-   * **Subject**: in the form of `[CIS565] Project 2: PENNKEY`
-   * Direct link to your pull request on GitHub
-   * In the form of a grade (0-100+) with comments, evaluate your own
-     performance on the project.
-   * Feedback on the project itself, if any.
+* Tongbo Sui
+* Tested on: Windows 10, i5-3320M @ 2.60GHz 8GB, NVS 5400M 2GB (Personal)
+
+## Project description
+
+* Scan:
+  * CPU scan & compaction
+  * Naive scan
+  * Work-efficient scan & compaction
+* Radix sort:
+  * A single-block/tile implementation of radix sort; splitting into tiles and merging are not implemented.
+  * Call the function as `count = RadixSort::sort(SIZE, c, a, 8);` (see the usage sketch below).
+  * The function writes the sorted result to the result array and returns the array size.
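+
+For reference, a minimal driver for the call above might look like this. This
+is a sketch: the array contents, `SIZE`, and the include path are illustrative,
+and the pass count of 8 assumes the key values fit in 8 bits.
+
+```
+#include <cstdlib>
+#include <stream_compaction/radix.h>
+
+int main() {
+    const int SIZE = 1 << 8;
+    int a[SIZE], c[SIZE];
+    for (int i = 0; i < SIZE; i++) a[i] = rand() % 4; // small keys; 8 bits is plenty
+    int count = RadixSort::sort(SIZE, c, a, 8);       // c holds the sorted keys; count == SIZE
+    return 0;
+}
+```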
+
+### Optimize block size
+
+Block size is optimized with the array size fixed at 256 while varying the
+number of blocks; `[1,2,4,8,16]` blocks are tested on each implementation:
+ * Naive scan: 4 blocks / 64 threads per block gives the best execution time
+ * Work-efficient scan: 2 blocks / 128 threads per block gives the best execution time
+
+### Performance
+
+All 4 implementations are run on array sizes from `2^4` to `2^18`. Due to
+memory constraints, larger array sizes were not tested.
+
+ * All implementations
+
+![](images/all-performance.jpg)
+
+ * Excluding the CPU
+
+![](images/no-cpu-performance.jpg)
+
+* Observations
+  * In theory the work-efficient scan should be faster than the naive scan,
+    yet it measures slower on these small array sizes. A profile report shows
+    several factors that might contribute to this result:
+    * The up-sweep alone generates 100% more memory transactions than the
+      naive scan; including the down-sweep, the work-efficient method issues
+      about 150% more transactions than the naive method.
+    * The up-sweep executes 50% more integer operations than the naive method
+      (see the index-arithmetic sketch below).
+    * Overall the work-efficient method shows 8% branch divergence, while the
+      naive method shows none.
+  * The naive implementation also achieves better occupancy per SM, in terms
+    of active blocks against the device limit. Together these factors could
+    explain why the naive scan currently beats the work-efficient
+    implementation.
+  * Thrust has the best performance among the GPU implementations. The Nsight
+    timeline shows it splitting the problem into tiles and running a
+    work-efficient scan on each: 40 blocks of 128 threads each, covering 5120
+    elements per pass, roughly `2^12`. Once the array grows well past that,
+    building tiles and merging their results becomes the major bottleneck.
+  * As the array size grows, GPU hardware resources saturate: additional
+    blocks can only be scheduled sequentially, which hurts performance
+    significantly, and issue efficiency drops, wasting cycles.
+  * With a cache hit rate of almost 0%, memory I/O takes an ever larger share
+    of the time as the array size increases.
+  * With all of the above combined, the CPU implementation is actually much
+    faster than the GPU ones in the final results. The C++11 `chrono` clock
+    only registered a nonzero millisecond reading when the CPU scan was run in
+    a nested loop about `2^8` times. The guess is that the CPU keeps winning
+    up to the point where its linear computation cost outweighs the GPU's
+    memory I/O overhead.
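+
+One low-cost way to shave the integer work noted above, not applied in the
+measurements here: the sweep kernels compute strides with double-precision
+`pow`, but every stride is a power of two, so a bit shift suffices. A
+hypothetical variant of `scanUp` from `efficient.cu`:
+
+```
+// Same logic as scanUp, with (int)pow((double)2, ...) replaced by shifts.
+__global__ void scanUpShift(int d, int *idata) {
+    int k = blockIdx.x * blockDim.x + threadIdx.x;
+    int stride = 1 << (d + 1);                        // 2^(d+1)
+    if (k % stride == 0) {
+        idata[k + stride - 1] += idata[k + (stride >> 1) - 1];
+    }
+}
+```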
+
+* Test program output
+  * The extra `XXX scan: 0.000000` lines are used for profiling the execution time of each implementation
+  * A radix sort test is also printed; it runs on the array generated for the stream compaction test and is checked against standard C++ `std::sort`
+  * The performance comparison is also printed; all 4 implementations are run on array sizes from `2^4` to `2^18`
+
+```
+****************
+** SCAN TESTS **
+****************
+    [ 38 19 38 37 5 47 15 35 0 12 3 0 42 ... 26 0 ]
+==== cpu scan, power-of-two ====
+    [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 6203 6229 ]
+==== cpu scan, non-power-of-two ====
+    [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 6146 6190 ]
+    passed
+==== naive scan, power-of-two ====
+Naive scan: 0.042304
+    passed
+==== naive scan, non-power-of-two ====
+Naive scan: 0.041088
+    passed
+==== work-efficient scan, power-of-two ====
+Work-efficient scan: 0.127552
+    passed
+==== work-efficient scan, non-power-of-two ====
+Work-efficient scan: 0.126304
+    passed
+==== thrust scan, power-of-two ====
+Thrust scan: 3.756288
+    passed
+==== thrust scan, non-power-of-two ====
+Thrust scan: 0.016544
+    passed
+
+*****************************
+** STREAM COMPACTION TESTS **
+*****************************
+    [ 2 3 2 1 3 1 1 1 2 0 1 0 2 ... 0 0 ]
+==== cpu compact without scan, power-of-two ====
+    [ 2 3 2 1 3 1 1 1 2 1 2 1 1 ... 2 1 ]
+    passed
+==== cpu compact without scan, non-power-of-two ====
+    [ 2 3 2 1 3 1 1 1 2 1 2 1 1 ... 3 2 ]
+    passed
+==== cpu compact with scan ====
+    [ 2 3 2 1 3 1 1 1 2 1 2 1 1 ... 2 1 ]
+    passed
+==== work-efficient compact, power-of-two ====
+Work-efficient scan: 0.125088
+    passed
+==== work-efficient compact, non-power-of-two ====
+Work-efficient scan: 0.125120
+    passed
+
+*****************************
+** RADIX SORT (Single block/tile) TEST **
+*****************************
+Thrust scan: 0.016512
+Thrust scan: 0.015072
+Thrust scan: 0.014912
+Thrust scan: 0.015008
+Thrust scan: 0.014912
+Thrust scan: 0.014944
+Thrust scan: 0.015008
+Thrust scan: 0.019840
+Radix sort:
+    [ 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 3 3 ]
+Std sort:
+    [ 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 3 3 ]
+    passed
+
+*****************************
+** SCAN PERFORMANCE **
+*****************************
+==== Array size: 16 ====
+CPU scan: 0.000000
+Naive scan: 0.001536
+Work-efficient scan: 0.003136
+Thrust scan: 0.016672
+
+==== Array size: 32 ====
+CPU scan: 0.000000
+Naive scan: 0.001536
+Work-efficient scan: 0.003040
+Thrust scan: 0.016512
+
+==== Array size: 64 ====
+CPU scan: 0.000000
+Naive scan: 0.029120
+Work-efficient scan: 0.003104
+Thrust scan: 0.016416
+
+==== Array size: 128 ====
+CPU scan: 0.000000
+Naive scan: 0.033376
+Work-efficient scan: 0.111296
+Thrust scan: 0.016512
+
+==== Array size: 256 ====
+CPU scan: 0.000000
+Naive scan: 0.041056
+Work-efficient scan: 0.126304
+Thrust scan: 0.016512
+
+==== Array size: 512 ====
+CPU scan: 0.000000
+Naive scan: 0.063456
+Work-efficient scan: 0.194720
+Thrust scan: 0.016736
+
+==== Array size: 1024 ====
+CPU scan: 0.000000
+Naive scan: 0.112128
+Work-efficient scan: 0.352768
+Thrust scan: 0.017056
+
+==== Array size: 2048 ====
+CPU scan: 0.000000
+Naive scan: 0.216704
+Work-efficient scan: 0.685760
+Thrust scan: 0.020064
+
+==== Array size: 4096 ====
+CPU scan: 0.000000
+Naive scan: 0.439424
+Work-efficient scan: 1.365280
+Thrust scan: 0.024992
+
+==== Array size: 8192 ====
+CPU scan: 0.000000
+Naive scan: 0.915264
+Work-efficient scan: 2.785376
+Thrust scan: 0.038912
+
+==== Array size: 16384 ====
+CPU scan: 0.000000
+Naive scan: 1.931072
+Work-efficient scan: 5.762080
+Thrust scan: 0.067264
+
+==== Array size: 32768 ====
+CPU scan: 0.000000
+Naive scan: 4.107552
+Work-efficient scan: 11.998688
+Thrust scan: 0.393440
+
+==== Array size: 65536 ====
+CPU scan: 0.000000
+Naive scan: 8.706624
+Work-efficient scan: 24.970079
+Thrust scan: 0.439232
+
+==== Array size: 131072 ====
+CPU scan: 0.000000
+Naive scan: 18.445057
+Work-efficient scan: 51.995682
+Thrust scan: 0.477088
+
+==== Array size: 262144 ====
+CPU scan: 0.000000
+Naive scan: 38.985985
+Work-efficient scan: 108.208801
+Thrust scan: 2.610464
+```
diff --git a/images/all-performance.jpg b/images/all-performance.jpg
new file mode 100644
index 0000000..f37543b
Binary files /dev/null and b/images/all-performance.jpg differ
diff --git a/images/custom-only-performance.jpg b/images/custom-only-performance.jpg
new file mode 100644
index 0000000..ac79a3b
Binary files /dev/null and b/images/custom-only-performance.jpg differ
diff --git a/images/no-cpu-performance.jpg b/images/no-cpu-performance.jpg
new file mode 100644
index 0000000..fef1806
Binary files /dev/null and b/images/no-cpu-performance.jpg differ
diff --git a/src/main.cpp b/src/main.cpp
index 675da35..7d63754 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -11,7 +11,11 @@
 #include
 #include
 #include
+#include <chrono>
+#include <algorithm> // std::sort
+#include <vector>    // std::vector
 #include "testing_helpers.hpp"
+#include <stream_compaction/radix.h>
 
 int main(int argc, char* argv[]) {
     const int SIZE = 1 << 8;
@@ -39,7 +43,7 @@ int main(int argc, char* argv[]) {
     StreamCompaction::CPU::scan(NPOT, c, a);
     printArray(NPOT, b, true);
     printCmpResult(NPOT, b, c);
-    
+
     zeroArray(SIZE, c);
     printDesc("naive scan, power-of-two");
     StreamCompaction::Naive::scan(SIZE, c, a);
@@ -63,7 +67,7 @@ int main(int argc, char* argv[]) {
     StreamCompaction::Efficient::scan(NPOT, c, a);
     //printArray(NPOT, c, true);
     printCmpResult(NPOT, b, c);
-    
+
     zeroArray(SIZE, c);
     printDesc("thrust scan, power-of-two");
     StreamCompaction::Thrust::scan(SIZE, c, a);
@@ -82,13 +86,11 @@ int main(int argc, char* argv[]) {
     printf("*****************************\n");
 
     // Compaction tests
-
     genArray(SIZE - 1, a, 4);  // Leave a 0 at the end to test that edge case
     a[SIZE - 1] = 0;
     printArray(SIZE, a, true);
 
     int count, expectedCount, expectedNPOT;
-
     zeroArray(SIZE, b);
     printDesc("cpu compact without scan, power-of-two");
     count = StreamCompaction::CPU::compactWithoutScan(SIZE, b, a);
@@ -120,4 +122,59 @@ int main(int argc, char* argv[]) {
     count = StreamCompaction::Efficient::compact(NPOT, c, a);
     //printArray(count, c, true);
     printCmpLenResult(count, expectedNPOT, b, c);
+
+    printf("\n");
+    printf("*****************************\n");
+    printf("** RADIX SORT (Single block/tile) TEST **\n");
+    printf("*****************************\n");
+
+    zeroArray(SIZE, c);
+    count = RadixSort::sort(SIZE, c, a, 8);
+
+    std::vector<int> s(a, a + SIZE);
+    std::sort(s.begin(), s.end());
+
+    int d[SIZE];
+    for (int i = 0; i < SIZE; i++){
+        d[i] = s[i];
+    }
+
+    printf("Radix sort:\n");
+    printArray(count, c, true);
+    printf("Std sort:\n");
+    printArray(count, d, true);
+    printCmpLenResult(count, count, d, c);
+
+    printf("\n");
+    printf("*****************************\n");
+    printf("** SCAN PERFORMANCE **\n");
+    printf("*****************************\n");
+
+    for (int s = 4; s < 19; s++){
+        int ssize = 1 << s;
+        int *u, *v;
+        u = (int *)malloc(ssize*sizeof(int));
+        v = (int *)malloc(ssize*sizeof(int));
+        printf("==== Array size: %d ====\n", ssize);
+        genArray(ssize, u, 50);
+
+        zeroArray(ssize, v);
+        auto start = std::chrono::high_resolution_clock::now();
+        StreamCompaction::CPU::scan(ssize, v, u);
+        auto end = std::chrono::high_resolution_clock::now();
+        double diff = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count();
+        printf("CPU scan: %f\n", diff);
+
+        zeroArray(ssize, v);
+        StreamCompaction::Naive::scan(ssize, v, u);
+
+        zeroArray(ssize, v);
+        StreamCompaction::Efficient::scan(ssize, v, u);
+
+        zeroArray(ssize, v);
+        StreamCompaction::Thrust::scan(ssize, v, u);
+        free(u);
+        free(v);
+        printf("\n");
+    }
 }
diff --git a/stream_compaction/CMakeLists.txt b/stream_compaction/CMakeLists.txt
index cdbef77..bcc484e 100644
--- a/stream_compaction/CMakeLists.txt
+++ b/stream_compaction/CMakeLists.txt
@@ -9,6 +9,8 @@ set(SOURCE_FILES
     "efficient.cu"
     "thrust.h"
     "thrust.cu"
+    "radix.h"
+    "radix.cu"
     )
 
 cuda_add_library(stream_compaction
diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu
index e600c29..27bc8b8 100644
--- a/stream_compaction/cpu.cu
+++ b/stream_compaction/cpu.cu
@@ -1,15 +1,19 @@
 #include
 #include "cpu.h"
+#include <cstdlib>
 
 namespace StreamCompaction {
 namespace CPU {
 
 /**
- * CPU scan (prefix sum).
+ * CPU scan (exclusive prefix sum).
  */
 void scan(int n, int *odata, const int *idata) {
-    // TODO
-    printf("TODO\n");
+    odata[0] = 0;
+    for (int i = 1; i < n; i++){
+        odata[i] = odata[i - 1] + idata[i - 1];
+    }
 }
 
 /**
@@ -18,8 +22,16 @@ void scan(int n, int *odata, const int *idata) {
  * @returns the number of elements remaining after compaction.
  */
 int compactWithoutScan(int n, int *odata, const int *idata) {
-    // TODO
-    return -1;
+    int count = 0;
+    for (int i = 0; i < n; i++){
+        if (idata[i] != 0){
+            odata[count] = idata[i];
+            count++;
+        }
+    }
+    return count;
 }
 
 /**
@@ -28,8 +40,35 @@ int compactWithoutScan(int n, int *odata, const int *idata) {
  * @returns the number of elements remaining after compaction.
  */
 int compactWithScan(int n, int *odata, const int *idata) {
-    // TODO
-    return -1;
+    // Map each element to a boolean keep-flag.
+    int *c = (int *)malloc(n * sizeof(int));
+    for (int i = 0; i < n; i++){
+        c[i] = (idata[i] != 0) ? 1 : 0;
+    }
+    // Exclusive scan of the flags gives each kept element its output index.
+    int *d = (int *)malloc(n * sizeof(int));
+    scan(n, d, c);
+    // Scatter the kept elements to their scanned indices.
+    for (int i = 0; i < n; i++){
+        if (c[i] == 1){
+            odata[d[i]] = idata[i];
+        }
+    }
+    // Surviving count = last exclusive-scan value + last flag.
+    int count = d[n - 1] + c[n - 1];
+    free(c);
+    free(d);
+    return count;
 }
 }
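
To make the scatter step in `compactWithScan` concrete, here is a small worked
example of the map/scan/scatter pipeline (illustrative values only):

```
Input:           [ 3 0 5 0 0 9 ]
Map to booleans: [ 1 0 1 0 0 1 ]   (1 where the element != 0)
Exclusive scan:  [ 0 1 1 2 2 2 ]   (each kept element's output index)
Scatter:         out[0] = 3, out[1] = 5, out[2] = 9  ->  [ 3 5 9 ]
Count:           last scan value + last flag = 2 + 1 = 3
```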
diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu
index b2f739b..79edaec 100644
--- a/stream_compaction/efficient.cu
+++ b/stream_compaction/efficient.cu
@@ -6,14 +6,93 @@
 namespace StreamCompaction {
 namespace Efficient {
 
-// TODO: __global__
+// Up-sweep (reduce) step at depth d: each active node accumulates its left child.
+__global__ void scanUp(int d, int *idata){
+    int k = blockIdx.x*blockDim.x + threadIdx.x;
+    if (k % (int)pow((double)2, (double)(d + 1)) == 0){
+        idata[k - 1 + (int)pow((double)2, (double)(d + 1))] += idata[k - 1 + (int)pow((double)2, (double)d)];
+    }
+}
+
+// Down-sweep step at depth d: propagate partial sums back down the tree.
+__global__ void scanDown(int d, int *idata){
+    int k = blockIdx.x*blockDim.x + threadIdx.x;
+    if (k % (int)pow((double)2, (double)(d + 1)) == 0){
+        int t = idata[k - 1 + (int)pow((double)2, (double)d)];
+        idata[k - 1 + (int)pow((double)2, (double)d)] = idata[k - 1 + (int)pow((double)2, (double)(d + 1))];
+        idata[k - 1 + (int)pow((double)2, (double)(d + 1))] += t;
+    }
+}
+
+// Map each element to 1 (keep) or 0 (discard).
+__global__ void filter(int *odata, int *idata){
+    int k = blockIdx.x*blockDim.x + threadIdx.x;
+    odata[k] = (idata[k] == 0) ? 0 : 1;
+}
+
+// Scatter kept elements to their scanned indices.
+__global__ void scatter(int *odata, int *idata, int *filter, int *idx){
+    int k = blockIdx.x*blockDim.x + threadIdx.x;
+    if (filter[k] == 1){
+        odata[idx[k]] = idata[k];
+    }
+}
 
 /**
  * Performs prefix-sum (aka scan) on idata, storing the result into odata.
  */
 void scan(int n, int *odata, const int *idata) {
-    // TODO
-    printf("TODO\n");
+    // Pad the input up to the next power of two.
+    int m = (int)pow((double)2, (double)ilog2ceil(n));
+    int *pidata = (int*)malloc(m*sizeof(int));
+    for (int i = 0; i < n; i++){
+        pidata[i] = idata[i];
+    }
+    for (int i = n; i < m; i++){
+        pidata[i] = 0;
+    }
+    int *dev_pidata;
+    cudaMalloc((void **)&dev_pidata, m*sizeof(int));
+    cudaMemcpy(dev_pidata, pidata, m*sizeof(int), cudaMemcpyHostToDevice);
+
+    cudaEvent_t start, stop;
+    cudaEventCreate(&start);
+    cudaEventCreate(&stop);
+
+    // Launch configuration (assumed here): 128-thread blocks, matching the
+    // block-size tuning reported in the README.
+    int blockSize = 128;
+    int gridSize = (m + blockSize - 1) / blockSize;
+
+    // Up-sweep
+    cudaEventRecord(start);
+    for (int d = 0; d < ilog2ceil(m); d++){
+        scanUp<<<gridSize, blockSize>>>(d, dev_pidata);
+    }
+    cudaEventRecord(stop);
+    cudaEventSynchronize(stop);
+    float ms1 = 0;
+    cudaEventElapsedTime(&ms1, start, stop);
+
+    // Zero the root before the down-sweep.
+    cudaMemcpy(pidata, dev_pidata, m*sizeof(int), cudaMemcpyDeviceToHost);
+    pidata[m - 1] = 0;
+    cudaMemcpy(dev_pidata, pidata, m*sizeof(int), cudaMemcpyHostToDevice);
+
+    // Down-sweep
+    cudaEventRecord(start);
+    for (int d = ilog2ceil(m) - 1; d >= 0; d--){
+        scanDown<<<gridSize, blockSize>>>(d, dev_pidata);
+    }
+    cudaEventRecord(stop);
+    cudaEventSynchronize(stop);
+    float ms2 = 0;
+    cudaEventElapsedTime(&ms2, start, stop);
+    printf("Work-efficient scan: %f\n", ms1 + ms2);
+
+    cudaMemcpy(pidata, dev_pidata, m*sizeof(int), cudaMemcpyDeviceToHost);
+    for (int i = 0; i < n; i++){
+        odata[i] = pidata[i];
+    }
+    cudaEventDestroy(start);
+    cudaEventDestroy(stop);
+    cudaFree(dev_pidata);
+    free(pidata);
 }
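
The `k % 2^(d+1)` guard in the sweep kernels above launches one thread per
element and masks most of them off, which is one source of the divergence
noted in the README. A possible alternative, sketched here but not what was
measured: launch only the active threads at each depth and compute each
thread's target index directly.

```
// Hypothetical compacted-index up-sweep: thread k handles the k-th active
// node at depth d, so every launched thread does useful work.
__global__ void scanUpCompact(int d, int numActive, int *idata) {
    int k = blockIdx.x * blockDim.x + threadIdx.x;
    if (k >= numActive) return;        // guard for a partial final block
    int stride = 1 << (d + 1);
    int i = (k + 1) * stride - 1;      // last element of this thread's segment
    idata[i] += idata[i - (stride >> 1)];
}
// Host side: launch numActive = m >> (d + 1) threads at depth d, instead of m.
```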
 
 /**
@@ -26,8 +105,43 @@ void scan(int n, int *odata, const int *idata) {
  * @returns The number of elements remaining after compaction.
  */
 int compact(int n, int *odata, const int *idata) {
-    // TODO
-    return -1;
+    // Map to keep-flags on the device. The single-block launches below
+    // assume n <= 1024 (the tests use n = 256).
+    int *f;
+    int *dev_idata;
+    cudaMalloc((void**)&f, n * sizeof(int));
+    cudaMalloc((void**)&dev_idata, n * sizeof(int));
+    cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+    filter<<<1, n>>>(f, dev_idata);
+
+    // Scan the flags (via the host-side wrapper) to get scatter indices.
+    int *hs_idx = (int *)malloc(n * sizeof(int));
+    int *hs_f = (int *)malloc(n*sizeof(int));
+    cudaMemcpy(hs_f, f, n * sizeof(int), cudaMemcpyDeviceToHost);
+    scan(n, hs_idx, hs_f);
+
+    int *idx;
+    int *dv_out;
+    cudaMalloc((void**)&idx, n * sizeof(int));
+    cudaMalloc((void**)&dv_out, n * sizeof(int));
+    cudaMemcpy(idx, hs_idx, n * sizeof(int), cudaMemcpyHostToDevice);
+    scatter<<<1, n>>>(dv_out, dev_idata, f, idx);
+
+    cudaMemcpy(odata, dv_out, n * sizeof(int), cudaMemcpyDeviceToHost);
+
+    // Surviving count = last exclusive-scan value + last flag.
+    int count = hs_idx[n - 1] + hs_f[n - 1];
+
+    cudaFree(f);
+    cudaFree(dev_idata);
+    cudaFree(idx);
+    cudaFree(dv_out);
+    free(hs_idx);
+    free(hs_f);
+
+    return count;
 }
 }
diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu
index 3d86b60..a33d29c 100644
--- a/stream_compaction/naive.cu
+++ b/stream_compaction/naive.cu
@@ -2,18 +2,61 @@
 #include
 #include "common.h"
 #include "naive.h"
+#include <cmath>
 
 namespace StreamCompaction {
 namespace Naive {
 
-// TODO: __global__
+// One depth of the naive (Hillis-Steele) scan. Note: this reads and writes
+// the same buffer within a single launch, which is a read-after-write hazard
+// across blocks; the GPU Gems formulation ping-pongs between two buffers
+// instead (see the sketch after this file's diff).
+__global__ void scanCol(int d, int *idata){
+    int k = blockIdx.x*blockDim.x + threadIdx.x;
+    if (k >= (int)pow((double)2, (double)(d - 1))){
+        idata[k] = idata[k - (int)pow((double)2, (double)(d - 1))] + idata[k];
+    }
+}
 
 /**
  * Performs prefix-sum (aka scan) on idata, storing the result into odata.
  */
 void scan(int n, int *odata, const int *idata) {
-    // TODO
-    printf("TODO\n");
+    // Pad the input up to the next power of two.
+    int m = (int)pow((double)2, (double)ilog2ceil(n));
+    int *pidata = (int*)malloc(m*sizeof(int));
+    for (int i = 0; i < n; i++){
+        pidata[i] = idata[i];
+    }
+    for (int i = n; i < m; i++){
+        pidata[i] = 0;
+    }
+    int *dev_pidata;
+    cudaMalloc((void **)&dev_pidata, m*sizeof(int));
+    cudaMemcpy(dev_pidata, pidata, m*sizeof(int), cudaMemcpyHostToDevice);
+
+    cudaEvent_t start, stop;
+    cudaEventCreate(&start);
+    cudaEventCreate(&stop);
+
+    // Launch configuration (assumed here): 64-thread blocks, matching the
+    // block-size tuning reported in the README.
+    int blockSize = 64;
+    int gridSize = (m + blockSize - 1) / blockSize;
+
+    // Inclusive scan: one kernel launch per depth.
+    cudaEventRecord(start);
+    for (int d = 1; d <= ilog2ceil(m); d++){
+        scanCol<<<gridSize, blockSize>>>(d, dev_pidata);
+    }
+    cudaEventRecord(stop);
+    cudaEventSynchronize(stop);
+
+    float msAdd = 0;
+    cudaEventElapsedTime(&msAdd, start, stop);
+    printf("Naive scan: %f\n", msAdd);
+
+    // Shift right by one to convert the inclusive result to an exclusive scan.
+    cudaMemcpy(pidata, dev_pidata, m*sizeof(int), cudaMemcpyDeviceToHost);
+    odata[0] = 0;
+    for (int i = 1; i < n; i++){
+        odata[i] = pidata[i - 1];
+    }
+    cudaEventDestroy(start);
+    cudaEventDestroy(stop);
+    cudaFree(dev_pidata);
+    free(pidata);
 }
 }
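
As flagged in the `scanCol` comment above, updating the buffer in place is the
classic pitfall of the naive scan; GPU Gems 3 (Section 39.2.1) instead reads
from one buffer and writes to another, swapping them each depth. A minimal
sketch of that variant (hypothetical names, not the measured code):

```
// Ping-pong naive scan step: read from `in`, write to `out`, so no thread
// can observe a partially updated value at this depth.
__global__ void scanColBuffered(int d, const int *in, int *out, int n) {
    int k = blockIdx.x * blockDim.x + threadIdx.x;
    if (k >= n) return;
    int offset = 1 << (d - 1);                   // 2^(d-1)
    out[k] = (k >= offset) ? in[k - offset] + in[k] : in[k];
}
// Host side: swap the in/out device pointers after each depth d = 1..ilog2ceil(n).
```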
diff --git a/stream_compaction/radix.cu b/stream_compaction/radix.cu
new file mode 100644
index 0000000..26386c6
--- /dev/null
+++ b/stream_compaction/radix.cu
@@ -0,0 +1,86 @@
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include "common.h"
+#include "radix.h"
+#include "thrust.h"
+
+namespace RadixSort {
+
+// Build the b (bit set) and e (bit clear) flag arrays for the current bit.
+__global__ void getTotalFalse(int *oB, int *oE, const int *idata, const int currentPass, const int n){
+    // Extra threads wrap onto the array and redo the same writes, which is benign.
+    int k = (blockIdx.x*blockDim.x + threadIdx.x) % n;
+
+    // Extract bit `currentPass` of the key.
+    int digit = 0;
+    int dec = idata[k];
+    for (int i = 0; i <= currentPass; i++){
+        digit = dec % 2;
+        dec = dec / 2;
+    }
+
+    if (digit == 0){
+        oE[k] = 1;
+        oB[k] = 0;
+    } else {
+        oB[k] = 1;
+        oE[k] = 0;
+    }
+}
+
+// t[k] = k - f[k] + totalFalses: destination index for keys with the bit set.
+__global__ void getT(int *oT, const int *iF, const int totalFalses, const int n){
+    int k = (blockIdx.x*blockDim.x + threadIdx.x) % n;
+    oT[k] = k - iF[k] + totalFalses;
+}
+
+// Scatter each key to t[k] if its bit is set, otherwise to f[k].
+__global__ void rearrange(int *odata, int *idata, int *oB, int *oT, int *oF, const int n){
+    int k = (blockIdx.x*blockDim.x + threadIdx.x) % n;
+    int d = oB[k] == 1 ? oT[k] : oF[k];
+    odata[d] = idata[k];
+}
+
+int sort(int n, int *odata, const int *idata, const int passes) {
+    int blockSize = 64;
+    int gridSize = (n + blockSize - 1) / blockSize;  // round up so all n elements are covered
+
+    int *dv_in, *dv_out;
+    cudaMalloc((void**)&dv_in, n*sizeof(int));
+    cudaMalloc((void**)&dv_out, n*sizeof(int));
+    cudaMemcpy(dv_in, idata, n*sizeof(int), cudaMemcpyHostToDevice);
+
+    int *dv_b, *dv_e, *dv_t, *dv_f;
+    cudaMalloc((void**)&dv_b, n*sizeof(int));
+    cudaMalloc((void**)&dv_e, n*sizeof(int));
+    cudaMalloc((void**)&dv_t, n*sizeof(int));
+    cudaMalloc((void**)&dv_f, n*sizeof(int));
+
+    int *hs_e, *hs_f;
+    hs_e = (int *)malloc(n*sizeof(int));
+    hs_f = (int *)malloc(n*sizeof(int));
+    for (int p = 0; p < passes; p++){
+        getTotalFalse<<<gridSize, blockSize>>>(dv_b, dv_e, dv_in, p, n);
+
+        cudaMemcpy(hs_e, dv_e, n*sizeof(int), cudaMemcpyDeviceToHost);
+        StreamCompaction::Thrust::scan(n, hs_f, hs_e);
+        cudaMemcpy(dv_f, hs_f, n*sizeof(int), cudaMemcpyHostToDevice);
+
+        int totalFalse = hs_e[n - 1] + hs_f[n - 1];
+
+        // Single-block launches: assumes n <= 1024 (single-tile implementation).
+        getT<<<1, n>>>(dv_t, dv_f, totalFalse, n);
+        rearrange<<<1, n>>>(dv_out, dv_in, dv_b, dv_t, dv_f, n);
+        cudaMemcpy(dv_in, dv_out, n*sizeof(int), cudaMemcpyDeviceToDevice);
+    }
+
+    cudaMemcpy(odata, dv_out, n*sizeof(int), cudaMemcpyDeviceToHost);
+
+    cudaFree(dv_in);
+    cudaFree(dv_out);
+    cudaFree(dv_b);
+    cudaFree(dv_e);
+    cudaFree(dv_t);
+    cudaFree(dv_f);
+    free(hs_e);
+    free(hs_f);
+
+    return n;
+}
+
+}
diff --git a/stream_compaction/radix.h b/stream_compaction/radix.h
new file mode 100644
index 0000000..cb828a7
--- /dev/null
+++ b/stream_compaction/radix.h
@@ -0,0 +1,5 @@
+#pragma once
+
+namespace RadixSort {
+    int sort(int n, int *odata, const int *idata, const int passes);
+}
diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu
index d8dbb32..5eb5d66 100644
--- a/stream_compaction/thrust.cu
+++ b/stream_compaction/thrust.cu
@@ -13,9 +13,35 @@ namespace Thrust {
  * Performs prefix-sum (aka scan) on idata, storing the result into odata.
  */
 void scan(int n, int *odata, const int *idata) {
-    // TODO use `thrust::exclusive_scan`
+    // `thrust::exclusive_scan`
     // example: for device_vectors dv_in and dv_out:
     // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin());
+    thrust::host_vector<int> hs_in(n);
+    thrust::host_vector<int> hs_out(n);
+    thrust::device_vector<int> dv_in(n);
+    thrust::device_vector<int> dv_out(n);
+    for (int i = 0; i < n; i++){
+        hs_in[i] = idata[i];
+    }
+    dv_in = hs_in;
+
+    cudaEvent_t start, stop;
+    cudaEventCreate(&start);
+    cudaEventCreate(&stop);
+
+    // Time only the device-side scan, excluding the copies above.
+    cudaEventRecord(start);
+    thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin());
+    cudaEventRecord(stop);
+    cudaEventSynchronize(stop);
+
+    float msAdd = 0;
+    cudaEventElapsedTime(&msAdd, start, stop);
+    printf("Thrust scan: %f\n", msAdd);
+
+    hs_out = dv_out;
+    for (int i = 0; i < n; i++){
+        odata[i] = hs_out[i];
+    }
 }
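
A final caveat on the Thrust timings above: the very first `Thrust scan`
measurement (3.756288 on the power-of-two scan test) is two orders of
magnitude larger than every later one (~0.015-0.017), which most likely
reflects one-time CUDA/Thrust initialization rather than the scan itself. A
sketch of a warm-up call that would keep that cost out of the first
measurement (hypothetical helper, not present in the code above):

```
#include <thrust/device_vector.h>
#include <thrust/scan.h>

// Run one throwaway scan so lazy initialization (context setup, module
// loading) is excluded from the first timed Thrust::scan call.
static void warmUpThrust() {
    thrust::device_vector<int> tmp(1, 0);
    thrust::exclusive_scan(tmp.begin(), tmp.end(), tmp.begin());
    cudaDeviceSynchronize();
}
```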