diff --git a/CMakeLists.txt b/CMakeLists.txt
index 76a65a0..c6816e2 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -67,7 +67,7 @@ if(${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
     set(CUDA_PROPAGATE_HOST_FLAGS OFF)
 endif()
 
-#add_subdirectory(stream_compaction)  # TODO: uncomment if using your stream compaction
+add_subdirectory(stream_compaction)  # TODO: uncomment if using your stream compaction
 add_subdirectory(src)
 
 cuda_add_executable(${CMAKE_PROJECT_NAME}
@@ -77,7 +77,7 @@ cuda_add_executable(${CMAKE_PROJECT_NAME}
 
 target_link_libraries(${CMAKE_PROJECT_NAME}
     src
-    #stream_compaction  # TODO: uncomment if using your stream compaction
+    stream_compaction  # TODO: uncomment if using your stream compaction
     ${CORELIBS}
     )
 
diff --git a/README.md b/README.md
index 0cc8122..144440f 100644
--- a/README.md
+++ b/README.md
@@ -3,48 +3,48 @@ CUDA Path Tracer
 
 **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 3**
 
-* (TODO) YOUR NAME HERE
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+* Guan Sun
+* Tested on: Mac OS X Yosemite 10.10.5, Intel Core i7 @ 2.3GHz 8GB, GeForce GT 650M 1024MB (Personal Laptop)
 
-### (TODO: Your README)
+## Project Description:
 
-*DO NOT* leave the README to the last minute! It is a crucial part of the
-project, and we will not be able to grade you without a good README.
+In this project, a CUDA-based path tracer capable of rendering globally-illuminated images is implemented. The features include:
+* Diffuse surfaces rendering
+* Perfectly specular-reflective (mirrored) surfaces rendering
+* Non-perfect specular surfaces rendering
+* Antialiasing
+* Motion blur
+* Work-efficient stream compaction using shared memory
 
-Instructions (delete me)
-========================
+### Diffuse surface
 
-This is **NOW** due ~~Thursday, September 24~~ **Tuesday, September 29** evening at midnight.
+![](img/diffuse.png)
 
-**Summary:**
-In this project, you'll implement a CUDA-based path tracer capable of rendering
-globally-illuminated images very quickly.
-Since in this class we are concerned with working in GPU programming,
-performance, and the generation of actual beautiful images (and not with
-mundane programming tasks like I/O), this project includes base code for
-loading a scene description file, described below, and various other things
-that generally make up a framework for previewing and saving images.
+### Perfectly specular-reflective surface
 
-The core renderer is left for you to implement. Finally, note that, while this
-base code is meant to serve as a strong starting point for a CUDA path tracer,
-you are not required to use it if you don't want to. You may also change any
-part of the base code as you please. **This is YOUR project.**
+![](img/specular.png)
 
-**Recommendation:** Every image you save should automatically get a different
-filename. Don't delete all of them! For the benefit of your README, keep a
-bunch of them around so you can pick a few to document your progress at the
-end.
+### Non-perfect specular surface
 
-### Contents
+![](img/nonperfect.png)
 
-* `src/` C++/CUDA source files.
-* `scenes/` Example scene description files.
-* `img/` Renders of example scene description files.
-  (These probably won't match precisely with yours.)
-* `external/` Includes and static libraries for 3rd party libraries.
+### Antialiasing
 
+![](img/antialias.png)
 
-### Running the code
+### Motion blur
+
+![](img/blur.png)
+
+### Work-efficient stream compaction
+
+![](img/compaction.png)
+
+Work-efficient stream compaction removes all the terminated rays to improve the rendering performance. This figure shows how the number of
+active rays changes as the trace depth increases within a single iteration. From the figure we can conclude that stream compaction only improves the rendering performance of open scenes, since in closed scenes the rays will never terminate before reaching the maximum trace depth.
+
+
+## Running the code
 
 The main function requires a scene description file. Call the program with
 one as an argument: `cis565_path_tracer scenes/sphere.txt`.
@@ -54,150 +54,13 @@ If you are using Visual Studio, you can set this in the Debugging > Command
 Arguments section in the Project properties. Make sure you get the path right -
 read the console for errors.
 
-#### Controls
+### Controls
 
 * Esc to save an image and exit.
 * Space to save an image. Watch the console for the output filename.
 * W/A/S/D and R/F move the camera. Arrow keys rotate.
 
-## Requirements
-
-**Ask on the mailing list for clarifications.**
-
-In this project, you are given code for:
-
-* Loading and reading the scene description format
-* Sphere and box intersection functions
-* Support for saving images
-* Working CUDA-GL interop for previewing your render while it's running
-* A function which generates random screen noise (instead of an actual render).
-
-You will need to implement the following features:
-
-* Raycasting from the camera into the scene through an imaginary grid of pixels
-  (the screen).
-  * Implement simple antialiasing (by jittering rays within each pixel).
-* Diffuse surfaces (using provided cosine-weighted scatter function) [PBRT 8.3].
-* Perfectly specular-reflective (mirrored) surfaces.
-  * See notes on diffuse/specular in `scatterRay` and on imperfect specular below.
-* Stream compaction optimization, using:
-* **NEWLY ADDED:** Work-efficient stream compaction using shared memory across
-  multiple blocks. (See
-  [*GPU Gems 3*, Chapter 39](http://http.developer.nvidia.com/GPUGems3/gpugems3_ch39.html).)
-
-You are also required to implement at least 2 of the following features. Please
-ask if you need good references. If you find good references, share them!
-**Extra credit**: implement more features on top of the 2 required ones,
-with point value up to +20/100 atthe grader's discretion
-(based on difficulty and coolness).
-
-* **NOW REQUIRED - NOT AN EXTRA:** ~~Work-efficient stream compaction (see above).~~
-* These 2 smaller features:
-  * Refraction (e.g. glass/water) [PBRT 8.2] with Frensel effects using
-    [Schlick's approximation](https://en.wikipedia.org/wiki/Schlick's_approximation)
-    or more accurate methods [PBRT 8.5].
-  * Physically-based depth-of-field (by jittering rays within an aperture)
-    [PBRT 6.2.3].
-  * Recommended but not required: non-perfect specular surfaces. (See below.)
-* Texture mapping [PBRT 10.4].
-* Bump mapping [PBRT 9.3].
-* Direct lighting (by taking a final ray directly to a random point on an
-  emissive object acting as a light source). Or more advanced [PBRT 15.1.1].
-* Some method of defining object motion, and motion blur by averaging samples
-  at different times in the animation.
-* Subsurface scattering [PBRT 5.6.2, 11.6].
-* Arbitrary mesh loading and rendering (e.g. `obj` files). You can find these
-  online or export them from your favorite 3D modeling application.
-  With approval, you may use a third-party OBJ loading code to bring the data
-  into C++.
-  * You can use the triangle intersection function `glm::intersectRayTriangle`.
-
-This 'extra features' list is not comprehensive. If you have a particular idea
-you would like to implement (e.g. acceleration structures, etc.), please
-contact us first.
-
-For each extra feature, you must provide the following analysis:
-
-* Overview write-up of the feature
-* Performance impact of the feature
-* If you did something to accelerate the feature, what did you do and why?
-* Compare your GPU version of the feature to a HYPOTHETICAL CPU version
-  (you don't have to implement it!) Does it benefit or suffer from being
-  implemented on the GPU?
-* How might this feature be optimized beyond your current implementation?
-
-## Base Code Tour
-
-You'll be working in the following files. Look for important parts of the code:
-search for `CHECKITOUT`. You'll have to implement parts labeled with `TODO`.
-(But don't let these constrain you - you have free rein!)
-
-* `src/pathtrace.cu`: path tracing kernels, device functions, and calling code
-  * `pathtraceInit` initializes the path tracer state - it should copy
-    scene data (e.g. geometry, materials) from `Scene`.
-  * `pathtraceFree` frees memory allocated by `pathtraceInit`
-  * `pathtrace` performs one iteration of the rendering - it handles kernel
-    launches, memory copies, transferring some data, etc.
-    * See comments for a low-level path tracing recap.
-* `src/intersections.h`: ray intersection functions
-  * `boxIntersectionTest` and `sphereIntersectionTest`, which take in a ray and
-    a geometry object and return various properties of the intersection.
-* `src/interactions.h`: ray scattering functions
-  * `calculateRandomDirectionInHemisphere`: a cosine-weighted random direction
-    in a hemisphere. Needed for implementing diffuse surfaces.
-  * `scatterRay`: this function should perform all ray scattering, and will
-    call `calculateRandomDirectionInHemisphere`. See comments for details.
-* `src/main.cpp`: you don't need to do anything here, but you can change the
-  program to save `.hdr` image files, if you want (for postprocessing).
-
-### Generating random numbers
-
-```
-thrust::default_random_engine rng(hash(index));
-thrust::uniform_real_distribution<float> u01(0, 1);
-float result = u01(rng);
-```
-
-There is a convenience function for generating a random engine using a
-combination of index, iteration, and depth as the seed:
-
-```
-thrust::default_random_engine rng = random_engine(iter, index, depth);
-```
-
-### Imperfect specular lighting
-
-In path tracing, like diffuse materials, specular materials are
-simulated using a probability distribution instead computing the
-strength of a ray bounce based on angles.
-
-Equations 7, 8, and 9 of
-[*GPU Gems 3*, Chapter 20](http://http.developer.nvidia.com/GPUGems3/gpugems3_ch20.html)
-give the formulas for generating a random specular ray. (Note that
-there is a typographical error: χ in the text = ξ in the formulas.)
-
-Also see the notes in `scatterRay` for probability splits between
-diffuse/specular/other material types.
-
-See also: PBRT 8.2.2.
-
-### Handling Long-Running CUDA Threads
-
-By default, your GPU driver will probably kill a CUDA kernel if it runs for more than 5 seconds. There's a way to disable this timeout. Just beware of infinite loops - they may lock up your computer.
-
-> The easiest way to disable TDR for Cuda programming, assuming you have the NVIDIA Nsight tools installed, is to open the Nsight Monitor, click on "Nsight Monitor options", and under "General" set "WDDM TDR enabled" to false. This will change the registry setting for you. Close and reboot. Any change to the TDR registry setting won't take effect until you reboot. [Stack Overflow](http://stackoverflow.com/questions/497685/cuda-apps-time-out-fail-after-several-seconds-how-to-work-around-this)
-
-### Notes on GLM
-
-This project uses GLM for linear algebra.
-
-On NVIDIA cards pre-Fermi (pre-DX12), you may have issues with mat4-vec4
-multiplication. If you have one of these cards, be careful! If you have issues,
-you might need to grab `cudamat4` and `multiplyMV` from the
-[Fall 2014 project](https://github.com/CIS565-Fall-2014/Project3-Pathtracer).
-Let us know if you need to do this.
-
-### Scene File Format
+## Scene File Format
 
 This project uses a custom scene description format. Scene files are flat text
 files that describe all geometry, materials, lights, cameras, and render
@@ -239,75 +102,17 @@ Objects are defined in the following fashion:
 * TRANS (float transx) (float transy) (float transz) //translation
 * ROTAT (float rotationx) (float rotationy) (float rotationz) //rotation
 * SCALE (float scalex) (float scaley) (float scalez) //scale
+* MOVING //define whether the object is moving or not
+* TRANSGOAL (float transx) (float transy) (float transz) //translation of the movement goal
+* FRAMES (int) //the frame span of the movement
 
-Two examples are provided in the `scenes/` directory: a single emissive sphere,
+Examples are provided in the `scenes/` directory: a single emissive sphere,
 and a simple cornell box made using cubes for walls and lights and a sphere in
 the middle.
 
-## Third-Party Code Policy
-
-* Use of any third-party code must be approved by asking on our Google Group.
-* If it is approved, all students are welcome to use it. Generally, we approve
-  use of third-party code that is not a core part of the project. For example,
-  for the path tracer, we would approve using a third-party library for loading
-  models, but would not approve copying and pasting a CUDA function for doing
-  refraction.
-* Third-party code **MUST** be credited in README.md.
-* Using third-party code without its approval, including using another
-  student's code, is an academic integrity violation, and will, at minimum,
-  result in you receiving an F for the semester.
-
-## README
-
-Please see: [**TIPS FOR WRITING AN AWESOME README**](https://github.com/pjcozzi/Articles/blob/master/CIS565/GitHubRepo/README.md)
-
-* Sell your project.
-* Assume the reader has a little knowledge of path tracing - don't go into
-  detail explaining what it is. Focus on your project.
-* Don't talk about it like it's an assignment - don't say what is and isn't
-  "extra" or "extra credit." Talk about what you accomplished.
-* Use this to document what you've done.
-* *DO NOT* leave the README to the last minute! It is a crucial part of the
-  project, and we will not be able to grade you without a good README.
-
-In addition:
-
-* This is a renderer, so include images that you've made!
-* Be sure to back your claims for optimization with numbers and comparisons.
-* If you reference any other material, please provide a link to it.
-* You wil not be graded on how fast your path tracer runs, but getting close to
-  real-time is always nice!
-* If you have a fast GPU renderer, it is very good to show case this with a
-  video to show interactivity. If you do so, please include a link!
-
-### Analysis
-
-* Stream compaction helps most after a few bounces. Print and plot the
-  effects of stream compaction within a single iteration (i.e. the number of
-  unterminated rays after each bounce) and evaluate the benefits you get from
-  stream compaction.
-* Compare scenes which are open (like the given cornell box) and closed
-  (i.e. no light can escape the scene). Again, compare the performance effects
-  of stream compaction! Remember, stream compaction only affects rays which
-  terminate, so what might you expect?
-
-
-## Submit
-
-If you have modified any of the `CMakeLists.txt` files at all (aside from the
-list of `SOURCE_FILES`), you must test that your project can build in Moore
-100B/C. Beware of any build issues discussed on the Google Group.
-
-1. Open a GitHub pull request so that we can see that you have finished.
-   The title should be "Submission: YOUR NAME".
-2. Send an email to the TA (gmail: kainino1+cis565@) with:
-   * **Subject**: in the form of `[CIS565] Project N: PENNKEY`.
-   * Direct link to your pull request on GitHub.
-   * Estimate the amount of time you spent on the project.
-   * If there were any outstanding problems, or if you did any extra
-     work, *briefly* explain.
-   * Feedback on the project itself, if any.
-
 ## References
 
 * [PBRT] Physically Based Rendering, Second Edition: From Theory To Implementation. Pharr, Matt and Humphreys, Greg. 2010.
+* UPenn CIS 565 lecture slides by Patrick Cozzi
+* http://http.developer.nvidia.com/GPUGems3/gpugems3_ch20.html
+* http://http.developer.nvidia.com/GPUGems3/gpugems3_ch39.html
diff --git a/img/antialias.png b/img/antialias.png
new file mode 100644
index 0000000..3a76fa1
Binary files /dev/null and b/img/antialias.png differ
diff --git a/img/blur.png b/img/blur.png
new file mode 100644
index 0000000..cc483fe
Binary files /dev/null and b/img/blur.png differ
diff --git a/img/compaction.png b/img/compaction.png
new file mode 100644
index 0000000..d1add1b
Binary files /dev/null and b/img/compaction.png differ
diff --git a/img/diffuse.png b/img/diffuse.png
new file mode 100644
index 0000000..759ce24
Binary files /dev/null and b/img/diffuse.png differ
diff --git a/img/nonperfect.png b/img/nonperfect.png
new file mode 100644
index 0000000..8627883
Binary files /dev/null and b/img/nonperfect.png differ
diff --git a/img/specular.png b/img/specular.png
new file mode 100644
index 0000000..8ad19ff
Binary files /dev/null and b/img/specular.png differ
diff --git a/scenes/cornell.txt b/scenes/cornell.txt
index 5f7b59b..9a1bc1f 100644
--- a/scenes/cornell.txt
+++ b/scenes/cornell.txt
@@ -111,7 +111,7 @@ SCALE       .01 10 10
 // Sphere
 OBJECT 6
 sphere
-material 4
+material 1
 TRANS       -1 4 -1
 ROTAT       0 0 0
 SCALE       3 3 3
diff --git a/scenes/cornell_blur.txt b/scenes/cornell_blur.txt
new file mode 100644
index 0000000..08e223b
--- /dev/null
+++ b/scenes/cornell_blur.txt
@@ -0,0 +1,130 @@
+// Emissive material (light)
+MATERIAL 0
+RGB         1 1 1
+SPECEX      0
+SPECRGB     0 0 0
+REFL        0
+REFR        0
+REFRIOR     0
+EMITTANCE   5
+
+// Diffuse white
+MATERIAL 1
+RGB         .98 .98 .98
+SPECEX      0
+SPECRGB     0 0 0
+REFL        0
+REFR        0
+REFRIOR     0
+EMITTANCE   0
+
+// Diffuse red
+MATERIAL 2
+RGB         .85 .35 .35
+SPECEX      0
+SPECRGB     0 0 0
+REFL        0
+REFR        0
+REFRIOR     0
+EMITTANCE   0
+
+// Diffuse green
+MATERIAL 3
+RGB         .35 .85 .35
+SPECEX      0
+SPECRGB     0 0 0
+REFL        0
+REFR        0
+REFRIOR     0
+EMITTANCE   0
+
+// Specular white
+MATERIAL 4
+RGB         .98 .98 .98
+SPECEX      0
+SPECRGB     .98 .98 .98
+REFL        1
+REFR        0
+REFRIOR     0
+EMITTANCE   0
+
+// Diffuse blue
+MATERIAL 5
+RGB         .35 .35 .85
+SPECEX      0
+SPECRGB     0 0 0
+REFL        0
+REFR        0
+REFRIOR     0
+EMITTANCE   0
+
+// Camera
+CAMERA
+RES         800 800
+FOVY        45
+ITERATIONS  5000
+DEPTH       8
+FILE        cornell
+EYE         0.0 5 10.5
+VIEW        0 0 -1
+UP          0 1 0
+
+
+// Ceiling light
+OBJECT 0
+cube
+material 0
+TRANS       0 10 0
+ROTAT       0 0 0
+SCALE       3 .3 3
+
+// Floor
+OBJECT 1
+cube
+material 1
+TRANS       0 0 0
+ROTAT       0 0 0
+SCALE       10 .01 10
+
+// Ceiling
+OBJECT 2
+cube
+material 1
+TRANS       0 10 0
+ROTAT       0 0 90
+SCALE       .01 10 10
+
+// Back wall
+OBJECT 3
+cube
+material 1
+TRANS       0 5 -5
+ROTAT       0 90 0
+SCALE       .01 10 10
+
+// Left wall
+OBJECT 4
+cube
+material 2
+TRANS       -5 5 0
+ROTAT       0 0 0
+SCALE       .01 10 10
+
+// Right wall
+OBJECT 5
+cube
+material 3
+TRANS       5 5 0
+ROTAT       0 0 0
+SCALE       .01 10 10
+
+// Sphere
+OBJECT 6
+sphere
+material 5
+TRANS       -1 4 -1
+ROTAT       0 0 0
+SCALE       3 3 3
+MOVING
+TRANSGOAL   -1 2 -1
+FRAMES       10
diff --git a/scenes/cornell_closed.txt b/scenes/cornell_closed.txt
new file mode 100644
index 0000000..c35b18d
--- /dev/null
+++ b/scenes/cornell_closed.txt
@@ -0,0 +1,125 @@
+// Emissive material (light)
+MATERIAL 0
+RGB         1 1 1
+SPECEX      0
+SPECRGB     0 0 0
+REFL        0
+REFR        0
+REFRIOR     0
+EMITTANCE   5
+
+// Diffuse white
+MATERIAL 1
+RGB         .98 .98 .98
+SPECEX      0
+SPECRGB     0 0 0
+REFL        0
+REFR        0
+REFRIOR     0
+EMITTANCE   0
+
+// Diffuse red
+MATERIAL 2
+RGB         .85 .35 .35
+SPECEX      0
+SPECRGB     0 0 0
+REFL        0
+REFR        0
+REFRIOR     0
+EMITTANCE   0
+
+// Diffuse green
+MATERIAL 3
+RGB         .35 .85 .35
+SPECEX      0
+SPECRGB     0 0 0
+REFL        0
+REFR        0
+REFRIOR     0
+EMITTANCE   0
+
+// Specular white
+MATERIAL 4
+RGB         .98 .98 .98
+SPECEX      0
+SPECRGB     .98 .98 .98
+REFL        1
+REFR        0
+REFRIOR     0
+EMITTANCE   0
+
+// Camera
+CAMERA
+RES         800 800
+FOVY        45
+ITERATIONS  5000
+DEPTH       8
+FILE        cornell
+EYE         0.0 5 10.5
+VIEW        0 0 -1
+UP          0 1 0
+
+
+// Ceiling light
+OBJECT 0
+cube
+material 0
+TRANS       0 10 0
+ROTAT       0 0 0
+SCALE       3 .3 3
+
+// Floor
+OBJECT 1
+cube
+material 1
+TRANS       0 0 0
+ROTAT       0 0 0
+SCALE       10 .01 10
+
+// Ceiling
+OBJECT 2
+cube
+material 1
+TRANS       0 10 0
+ROTAT       0 0 90
+SCALE       .01 10 10
+
+// Back wall
+OBJECT 3
+cube
+material 1
+TRANS       0 5 -5
+ROTAT       0 90 0
+SCALE       .01 10 10
+
+// Left wall
+OBJECT 4
+cube
+material 2
+TRANS       -5 5 0
+ROTAT       0 0 0
+SCALE       .01 10 10
+
+// Right wall
+OBJECT 5
+cube
+material 3
+TRANS       5 5 0
+ROTAT       0 0 0
+SCALE       .01 10 10
+
+// Front wall
+OBJECT 6
+cube
+material 1
+TRANS       0 5 5
+ROTAT       0 90 0
+SCALE       .01 10 10
+
+// Sphere
+OBJECT 7
+sphere
+material 1
+TRANS       -1 4 -1
+ROTAT       0 0 0
+SCALE       3 3 3
diff --git a/scenes/cornell_nonperfect.txt b/scenes/cornell_nonperfect.txt
new file mode 100644
index 0000000..f4713f7
--- /dev/null
+++ b/scenes/cornell_nonperfect.txt
@@ -0,0 +1,127 @@
+// Emissive material (light)
+MATERIAL 0
+RGB         1 1 1
+SPECEX      0
+SPECRGB     0 0 0
+REFL        0
+REFR        0
+REFRIOR     0
+EMITTANCE   5
+
+// Diffuse white
+MATERIAL 1
+RGB         .98 .98 .98
+SPECEX      0
+SPECRGB     0 0 0
+REFL        0
+REFR        0
+REFRIOR     0
+EMITTANCE   0
+
+// Diffuse red
+MATERIAL 2
+RGB         .85 .35 .35
+SPECEX      0
+SPECRGB     0 0 0
+REFL        0
+REFR        0
+REFRIOR     0
+EMITTANCE   0
+
+// Diffuse green
+MATERIAL 3
+RGB         .35 .85 .35
+SPECEX      0
+SPECRGB     0 0 0
+REFL        0
+REFR        0
+REFRIOR     0
+EMITTANCE   0
+
+// Specular white
+MATERIAL 4
+RGB         .98 .98 .98
+SPECEX      0
+SPECRGB     .98 .98 .98
+REFL        1
+REFR        0
+REFRIOR     0
+EMITTANCE   0
+
+// Non-perfect specular blue
+MATERIAL 5
+RGB         .35 .35 .85
+SPECEX      0.5
+SPECRGB     .35 .35 .85
+REFL        1
+REFR        0
+REFRIOR     0
+EMITTANCE   0
+
+// Camera
+CAMERA
+RES         800 800
+FOVY        45
+ITERATIONS  5000
+DEPTH       8
+FILE        cornell
+EYE         0.0 5 10.5
+VIEW        0 0 -1
+UP          0 1 0
+
+
+// Ceiling light
+OBJECT 0
+cube
+material 0
+TRANS       0 10 0
+ROTAT       0 0 0
+SCALE       3 .3 3
+
+// Floor
+OBJECT 1
+cube
+material 1
+TRANS       0 0 0
+ROTAT       0 0 0
+SCALE       10 .01 10
+
+// Ceiling
+OBJECT 2
+cube
+material 1
+TRANS       0 10 0
+ROTAT       0 0 90
+SCALE       .01 10 10
+
+// Back wall
+OBJECT 3
+cube
+material 1
+TRANS       0 5 -5
+ROTAT       0 90 0
+SCALE       .01 10 10
+
+// Left wall
+OBJECT 4
+cube
+material 2
+TRANS       -5 5 0
+ROTAT       0 0 0
+SCALE       .01 10 10
+
+// Right wall
+OBJECT 5
+cube
+material 3
+TRANS       5 5 0
+ROTAT       0 0 0
+SCALE       .01 10 10
+
+// Sphere
+OBJECT 6
+sphere
+material 5
+TRANS       -1 4 -1
+ROTAT       0 0 0
+SCALE       3 3 3
diff --git a/src/interactions.h b/src/interactions.h
index d8107fb..2940ea6 100644
--- a/src/interactions.h
+++ b/src/interactions.h
@@ -50,7 +50,7 @@ glm::vec3 calculateRandomDirectionInHemisphere(
  * 
  * The visual effect you want is to straight-up add the diffuse and specular
  * components. You can do this in a few ways. This logic also applies to
- * combining other types of materias (such as refractive).
+ * combining other types of materials (such as refractive).
  * 
  * - Always take an even (50/50) split between a each effect (a diffuse bounce
  *   and a specular bounce), but divide the resulting color of either branch
@@ -74,7 +74,31 @@ void scatterRay(
         glm::vec3 normal,
         const Material &m,
         thrust::default_random_engine &rng) {
-    // TODO: implement this.
-    // A basic implementation of pure-diffuse shading will just call the
-    // calculateRandomDirectionInHemisphere defined above.
+	// Bounces `ray` off the surface point `intersect` and tints `color`:
+	//  - reflective, specular exponent 0: mirror bounce, specular color.
+	//  - reflective, other exponents: even 50/50 choice between a mirror
+	//    and a diffuse bounce; the chosen branch is divided by its 0.5
+	//    probability (scaled by 2) so neither lobe loses half its energy.
+	//  - not reflective: cosine-weighted diffuse bounce, surface color.
+	thrust::uniform_real_distribution<float> u01(0, 1);
+
+	// All branches restart the ray at the hit point, offset along the normal.
+	ray.origin = intersect + normal * EPSILON;
+
+	bool mirrorBounce = false;
+	float weight = 1.0f;
+	if (m.hasReflective) {
+		mirrorBounce = true;
+		if (m.specular.exponent != 0) {
+			mirrorBounce = u01(rng) < 0.5f;
+			weight = 2.0f;  // 1 / (branch probability of 0.5)
+		}
+	}
+	if (mirrorBounce) {
+		ray.direction = glm::reflect(ray.direction, normal);
+		color *= m.specular.color * weight;
+	} else {
+		ray.direction = calculateRandomDirectionInHemisphere(normal, rng);
+		color *= m.color * weight;
+	}
 }
diff --git a/src/intersections.h b/src/intersections.h
index f34b89d..c071e43 100644
--- a/src/intersections.h
+++ b/src/intersections.h
@@ -110,7 +110,7 @@ __host__ __device__ float sphereIntersectionTest(Geom sphere, Ray r,
     rt.direction = rd;
 
     float vDotDirection = glm::dot(rt.origin, rt.direction);
-    float radicand = vDotDirection * vDotDirection - (glm::dot(rt.origin, rt.origin) - pow(radius, 2));
+    float radicand = vDotDirection * vDotDirection - (glm::dot(rt.origin, rt.origin) - powf(radius, 2));
     if (radicand < 0) {
         return -1;
     }
diff --git a/src/main.cpp b/src/main.cpp
index 77671f4..c10c2b8 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -27,6 +27,12 @@ int main(int argc, char** argv) {
     }
 
     const char *sceneFile = argv[1];
+    //const char *sceneFile = "scenes/cornell.txt";
+    //const char *sceneFile = "scenes/cornell_nonperfect.txt";
+    //const char *sceneFile = "scenes/cornell_blur.txt";
+    //const char *sceneFile = "scenes/cornell_closed.txt";
+    //printf(argv[0]);
+    //printf(argv[1]);
 
     // Load scene file
     scene = new Scene(sceneFile);
@@ -99,8 +105,14 @@ void runCuda() {
         cudaGLMapBufferObject((void**)&pbo_dptr, pbo);
 
         // execute the kernel
-        int frame = 0;
-        pathtrace(pbo_dptr, frame, iteration);
+        if( scene->blur ) {
+        	for( int frame = 0; frame<(scene->frames); frame++ ){
+        		pathtrace(pbo_dptr, frame, scene->frames, iteration);
+        		//printf("frame is : %d", frame);
+        	}
+        } else {
+        	pathtrace(pbo_dptr, 0, 0, iteration);
+        }
 
         // unmap buffer object
         cudaGLUnmapBufferObject(pbo);
diff --git a/src/pathtrace.cu b/src/pathtrace.cu
index e7ef1c6..9ff363f 100644
--- a/src/pathtrace.cu
+++ b/src/pathtrace.cu
@@ -9,10 +9,12 @@
 #include "scene.h"
 #include "glm/glm.hpp"
 #include "glm/gtx/norm.hpp"
+#include <glm/gtc/matrix_inverse.hpp>
 #include "utilities.h"
 #include "pathtrace.h"
 #include "intersections.h"
 #include "interactions.h"
+#include "../stream_compaction/efficient.h"
 
 #define ERRORCHECK 1
 
@@ -69,8 +71,15 @@ __global__ void sendImageToPBO(uchar4* pbo, glm::ivec2 resolution,
 
 static Scene *hst_scene = NULL;
 static glm::vec3 *dev_image = NULL;
-// TODO: static variables for device memory, scene/camera info, etc
-// ...
+
+static Ray* dev_rays = NULL;
+static Ray* dev_raysNew = NULL;
+static Geom* dev_geoms = NULL;
+static Material* dev_materials = NULL;
+
+// Antialiasing
+static int sampleTimes = 1;
+static glm::vec3 *dev_image_antialias = NULL;
 
 void pathtraceInit(Scene *scene) {
     hst_scene = scene;
@@ -79,48 +88,129 @@ void pathtraceInit(Scene *scene) {
 
     cudaMalloc(&dev_image, pixelcount * sizeof(glm::vec3));
     cudaMemset(dev_image, 0, pixelcount * sizeof(glm::vec3));
-    // TODO: initialize the above static variables added above
+    // Ray buffers: one zero-initialised Ray slot per pixel, in two equally sized buffers.
+    cudaMalloc(&dev_rays, pixelcount * sizeof(Ray));
+    cudaMemset(dev_rays, 0, pixelcount * sizeof(Ray));
+
+	cudaMalloc(&dev_raysNew, pixelcount * sizeof(Ray));
+	cudaMemset(dev_raysNew, 0, pixelcount * sizeof(Ray));
+	// Scene geometry, copied from the host scene.
+	cudaMalloc(&dev_geoms, hst_scene->geoms.size() * sizeof(Geom));
+	cudaMemcpy(dev_geoms, &(hst_scene->geoms)[0], hst_scene->geoms.size() * sizeof(Geom), cudaMemcpyHostToDevice);
+	// Materials: the buffer is sized by the number of materials copied into it, not by the pixel count.
+	cudaMalloc(&dev_materials, hst_scene->materials.size() * sizeof(Material));
+	cudaMemcpy(dev_materials, &(hst_scene->materials)[0], hst_scene->materials.size() * sizeof(Material), cudaMemcpyHostToDevice);
+    // Per-pixel accumulation buffer that fillImage adds radiance into.
+    cudaMalloc(&dev_image_antialias, pixelcount * sizeof(glm::vec3));
+    cudaMemset(dev_image_antialias, 0, pixelcount * sizeof(glm::vec3));
 
     checkCUDAError("pathtraceInit");
 }
 
 void pathtraceFree() {
     cudaFree(dev_image);  // no-op if dev_image is null
-    // TODO: clean up the above static variables
+	cudaFree(dev_rays);  // ray buffer handed to the tracing kernels
+	cudaFree(dev_raysNew);  // output buffer of the stream compaction step
+	cudaFree(dev_geoms);  // device copy of the scene geometry
+	cudaFree(dev_materials);  // device copy of the scene materials
+	cudaFree(dev_image_antialias);  // accumulation buffer written by fillImage
 
     checkCUDAError("pathtraceFree");
 }
 
-/**
- * Example function to generate static and test the CUDA-GL interop.
- * Delete this once you're done looking at it!
- */
-__global__ void generateNoiseDeleteMe(Camera cam, int iter, glm::vec3 *image) {
-    int x = (blockIdx.x * blockDim.x) + threadIdx.x;
-    int y = (blockIdx.y * blockDim.y) + threadIdx.y;
 
-    if (x < cam.resolution.x && y < cam.resolution.y) {
-        int index = x + (y * cam.resolution.x);
+__global__ void initRays(Camera cam, int iter, Ray* rays, int sampleTimes) {
+	// Builds the primary camera ray for each pixel (one thread per pixel, 2D launch).
+	int x = (blockIdx.x * blockDim.x) + threadIdx.x;
+	int y = (blockIdx.y * blockDim.y) + threadIdx.y;
+	int index = x + (y * cam.resolution.x);
-        thrust::default_random_engine rng = makeSeededRandomEngine(iter, index, 0);
-        thrust::uniform_real_distribution<float> u01(0, 1);
+	if (x < cam.resolution.x && y < cam.resolution.y) {
+		// RNG seeded from the iteration number and the pixel index.
+		thrust::default_random_engine rng = makeSeededRandomEngine(iter, index, 0);
+		// Jitter offsets are fractional, so they come from a real-valued distribution.
+		thrust::uniform_real_distribution<float> u04(-4.0f, 4.0f);
+		float X, Y;
+		glm::vec3 camRight = glm::cross(cam.up, cam.view);
-        // CHECKITOUT: Note that on every iteration, noise gets added onto
-        // the image (not replaced). As a result, the image smooths out over
-        // time, since the output image is the contents of this array divided
-        // by the number of iterations.
-        //
-        // Your renderer will do the same thing, and, over time, it will become
-        // smoother.
-        image[index] += glm::vec3(u01(rng));
-    }
+		if(sampleTimes == 1){  // single sample: no jitter
+			X = (-(cam.resolution.x / 2.0f - x ) * sin(cam.fov.x)) / cam.resolution.x * 2;
+			Y = ((cam.resolution.y / 2.0f - y ) * sin(cam.fov.y)) / cam.resolution.y * 2;
+		} else {  // several samples: jitter the pixel position for antialiasing
+			X = (-(cam.resolution.x / 2.0f - x + u04(rng)) * sin(cam.fov.x)) / cam.resolution.x * 2;
+			Y = ((cam.resolution.y / 2.0f - y + u04(rng)) * sin(cam.fov.y)) / cam.resolution.y * 2;
+		}
+		rays[index].direction = cam.view + X * camRight + Y * cam.up;
+		rays[index].origin = cam.position;
+		rays[index].color = glm::vec3(1.0f);
+		rays[index].imageIndex = index;
+		rays[index].run = true;
+		rays[index].hit = false;  // fresh ray: drop any hit left over from a previous pass
+	}
+}
+
+__global__ void computeRays( Ray *rays, const Geom *geoms, const int objNumber) {
+	// Finds, for one ray per thread, the nearest positive-t hit among the first objNumber geoms.
+	// NOTE(review): there is no ray-count guard here; the launch must not exceed the rays buffer.
+	int index = blockIdx.x * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x;
+	bool outsideFlag;
+	float closestT = 999999999999;
+	glm::vec3 normal, intersectionPoint;
+	rays[index].hit = false;  // nothing hit yet on this bounce; never reuse an earlier bounce's result
+	for (int i = 0; i < objNumber; i++) {
+		float t = -1.0f;  // geometry types other than CUBE/SPHERE count as a miss
+		if (geoms[i].type == CUBE) {
+			t = boxIntersectionTest(geoms[i], rays[index], intersectionPoint, normal, outsideFlag);
+		} else if (geoms[i].type == SPHERE) {
+			t = sphereIntersectionTest(geoms[i], rays[index], intersectionPoint, normal, outsideFlag);
+		}
+		if ( t > 0 && t < closestT ) {
+			closestT = t;
+			rays[index].hit = true;
+			rays[index].intersectionGeomIndex = i;
+			rays[index].intersectionPoint = intersectionPoint;
+			rays[index].intersectionNormal = normal;
+		}
+	}
+}
+
+__global__ void fillImage(int frame, int frames, int iter, int depth, glm::vec3 *image, Ray *rays, const Geom *geoms, const Material *materials) {
+	// Shades one ray per thread: misses terminate, emissive hits deposit light into image, other hits scatter.
+	int index = blockIdx.x * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x;
+	int imageIndex = rays[index].imageIndex;
+	thrust::default_random_engine rng = makeSeededRandomEngine(iter, imageIndex, depth);
+
+	if ( !rays[index].hit ) {
+		// Nothing was hit: stop the ray without adding any light.
+		rays[index].run = false;
+		return;
+	}
+	int materialIndex = geoms[rays[index].intersectionGeomIndex].materialid;
+	if (materials[materialIndex].emittance) {
+		// Each of the `frames` motion-blur passes gets an equal share; frames == 0 means one unblurred pass.
+		float frameWeight = 1.0f / (float)(frames > 0 ? frames : 1);
+		rays[index].run = false;
+		image[imageIndex] += materials[materialIndex].color * materials[materialIndex].emittance * rays[index].color * frameWeight;
+	}
+	else {
+		scatterRay(rays[index], rays[index].color, rays[index].intersectionPoint, rays[index].intersectionNormal, materials[materialIndex], rng);
+	}
+}
+
+
+__global__ void averageImage( Camera cam, glm::vec3 *image_anti, glm::vec3 *image, int sampleTimes) {
+	// Per pixel: image = image_anti / sampleTimes; threads outside the frame do nothing.
+	const int col = (blockIdx.x * blockDim.x) + threadIdx.x;
+	const int row = (blockIdx.y * blockDim.y) + threadIdx.y;
+	if (col >= cam.resolution.x || row >= cam.resolution.y) return;
+
+	const int pixel = col + (row * cam.resolution.x);
+	image[pixel] = image_anti[pixel]/(float)sampleTimes;
+}
 
 /**
  * Wrapper for the __global__ call that sets up the kernel calls and does a ton
  * of memory management
  */
-void pathtrace(uchar4 *pbo, int frame, int iter) {
+void pathtrace(uchar4 *pbo, int frame, int frames, int iter) {
     const int traceDepth = hst_scene->state.traceDepth;
     const Camera &cam = hst_scene->state.camera;
     const int pixelcount = cam.resolution.x * cam.resolution.y;
@@ -157,9 +247,45 @@ void pathtrace(uchar4 *pbo, int frame, int iter) {
     // * Finally, handle all of the paths that still haven't terminated.
     //   (Easy way is to make them black or background-colored.)
 
-    // TODO: perform one iteration of path tracing
+    // Motion blur: place every moving geom at its interpolated position for this frame and rebuild its matrices.
+    Geom *geoms = &(hst_scene->geoms)[0];
+    glm::vec3 translationCurrent;
+    bool blur = false;
+    // Interpolation fraction, guarded so that frames == 0 cannot turn the transforms into NaN/inf.
+    const float blurT = (frames > 0) ? (float)frame / (float)frames : 0.0f;
+    for(int i=0; i<hst_scene->geoms.size(); i++) {
+    	if(geoms[i].moving) {
+    		blur = true;
+    		translationCurrent = geoms[i].translation + (geoms[i].translationGoal - geoms[i].translation) * blurT;
+    		geoms[i].transform = utilityCore::buildTransformationMatrix(translationCurrent, geoms[i].rotation, geoms[i].scale);
+    		geoms[i].inverseTransform = glm::inverse(geoms[i].transform);
+    		geoms[i].invTranspose = glm::inverseTranspose(geoms[i].transform);
+    	}
+    }
+    if(blur) {  // re-upload the geometry only when something actually moved
+    	cudaMemcpy(dev_geoms, &(hst_scene->geoms)[0], hst_scene->geoms.size() * sizeof(Geom), cudaMemcpyHostToDevice);
+    }
+
+    int rayNumber = pixelcount;
+    for(int i=0; i<sampleTimes; i++) {
+    	initRays<<<blocksPerGrid2d, blockSize2d>>>(cam, iter, dev_rays, sampleTimes);
+    	checkCUDAError("initRays");
+    	rayNumber = pixelcount;  // initRays is launched over the whole image, so each sample starts from the full ray set again
+    	for(int d=0; d<traceDepth; d++) {
+    		dim3 blocksPerGrid = (rayNumber + 64 - 1) / 64;
 
-    generateNoiseDeleteMe<<<blocksPerGrid2d, blockSize2d>>>(cam, iter, dev_image);
+    		computeRays<<<blocksPerGrid, blockSize2d>>>( dev_rays, dev_geoms, hst_scene->geoms.size());
+    		checkCUDAError("computeRays");
+
+    		fillImage<<<blocksPerGrid, blockSize2d>>>(frame, frames, iter, d, dev_image_antialias, dev_rays, dev_geoms, dev_materials);
+    		checkCUDAError("fillImage");
+
+    		rayNumber = StreamCompaction::Efficient::compact(rayNumber, dev_raysNew, dev_rays);
+    		cudaMemcpy(dev_rays, dev_raysNew, pixelcount * sizeof(Ray), cudaMemcpyDeviceToDevice);
+    		if(rayNumber <= 0) break;  // every path has terminated; another bounce would launch a grid with zero blocks
+    	}
+    }
+    averageImage<<<blocksPerGrid2d, blockSize2d>>>(cam, dev_image_antialias, dev_image, sampleTimes);
 
     ///////////////////////////////////////////////////////////////////////////
 
diff --git a/src/pathtrace.h b/src/pathtrace.h
index 1241227..a4accea 100644
--- a/src/pathtrace.h
+++ b/src/pathtrace.h
@@ -5,4 +5,4 @@
 
 void pathtraceInit(Scene *scene);
 void pathtraceFree();
-void pathtrace(uchar4 *pbo, int frame, int iteration);
+void pathtrace(uchar4 *pbo, int frame, int frames, int iteration);  // frame / frames is the motion-blur interpolation fraction applied to moving geoms
diff --git a/src/scene.cpp b/src/scene.cpp
index 5804ce3..c2a45e0 100644
--- a/src/scene.cpp
+++ b/src/scene.cpp
@@ -34,6 +34,7 @@ Scene::Scene(string filename) {
 
 int Scene::loadGeom(string objectid) {
     int id = atoi(objectid.c_str());
+    //bool &blur = this->blur;
     if (id != geoms.size()) {
         cout << "ERROR: OBJECT ID does not match expected number of geoms" << endl;
         return -1;
@@ -74,6 +75,14 @@ int Scene::loadGeom(string objectid) {
                 newGeom.rotation = glm::vec3(atof(tokens[1].c_str()), atof(tokens[2].c_str()), atof(tokens[3].c_str()));
             } else if (strcmp(tokens[0].c_str(), "SCALE") == 0) {
                 newGeom.scale = glm::vec3(atof(tokens[1].c_str()), atof(tokens[2].c_str()), atof(tokens[3].c_str()));
+            } else if( strcmp(tokens[0].c_str(), "MOVING") == 0 ) {
+            	newGeom.moving = true;
+            	this->blur = true;
+            } else if( strcmp(tokens[0].c_str(), "FRAMES") == 0 ) {
+            	newGeom.frames = atof(tokens[1].c_str());
+            	this->frames = atof(tokens[1].c_str());
+            } else if(strcmp(tokens[0].c_str(), "TRANSGOAL") == 0) {
+                newGeom.translationGoal = glm::vec3(atof(tokens[1].c_str()), atof(tokens[2].c_str()), atof(tokens[3].c_str()));
             }
 
             utilityCore::safeGetline(fp_in, line);
diff --git a/src/scene.h b/src/scene.h
index f29a917..780645b 100644
--- a/src/scene.h
+++ b/src/scene.h
@@ -23,4 +23,6 @@ class Scene {
     std::vector<Geom> geoms;
     std::vector<Material> materials;
     RenderState state;
+    bool blur;  // set to true by Scene::loadGeom when an object has a MOVING line; NOTE(review): no default value is visible here - confirm it is initialised
+    int frames;  // overwritten by every FRAMES line read in Scene::loadGeom; NOTE(review): likewise has no visible default
 };
diff --git a/src/sceneStructs.h b/src/sceneStructs.h
index baa2e30..2eb10d0 100644
--- a/src/sceneStructs.h
+++ b/src/sceneStructs.h
@@ -13,6 +13,13 @@ enum GeomType {
 struct Ray {
     glm::vec3 origin;
     glm::vec3 direction;
+	glm::vec3 color;  // path colour so far: multiplied into the emitter's contribution in fillImage and passed to scatterRay
+	int imageIndex;  // index of the image pixel this ray contributes to
+	bool run;  // cleared by fillImage when the path ends; stream compaction keeps only rays with run == true
+	bool hit;  // read by fillImage: when false the path is stopped without adding anything to the image
+	glm::vec3 intersectionPoint;  // stored by the intersection kernel and handed to scatterRay
+	glm::vec3 intersectionNormal;  // stored together with intersectionPoint and handed to scatterRay
+	int intersectionGeomIndex;  // index into the geoms array, used by fillImage to look up the material
 };
 
 struct Geom {
@@ -24,6 +31,9 @@ struct Geom {
     glm::mat4 transform;
     glm::mat4 inverseTransform;
     glm::mat4 invTranspose;
+    bool moving;  // set by a MOVING line in the scene file; pathtrace re-positions only geoms with this flag. NOTE(review): only assigned when MOVING is present - confirm it defaults to false
+    glm::vec3 translationGoal;  // TRANSGOAL value from the scene file: the translation reached when frame == frames
+    int frames;  // FRAMES value from the scene file
 };
 
 struct Material {
diff --git a/stream_compaction/CMakeLists.txt b/stream_compaction/CMakeLists.txt
index ac358c9..9434d18 100644
--- a/stream_compaction/CMakeLists.txt
+++ b/stream_compaction/CMakeLists.txt
@@ -1,4 +1,8 @@
 set(SOURCE_FILES
+    "common.h"
+    "common.cu"
+    "efficient.h"
+    "efficient.cu"
     )
 
 cuda_add_library(stream_compaction
diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu
new file mode 100644
index 0000000..4b1031c
--- /dev/null
+++ b/stream_compaction/common.cu
@@ -0,0 +1,53 @@
+#include "common.h"
+
+//void checkCUDAErrorFn(const char *msg, const char *file, int line) {
+//    cudaError_t err = cudaGetLastError();
+//    if (cudaSuccess == err) {
+//        return;
+//    }
+//
+//    fprintf(stderr, "CUDA error");
+//    if (file) {
+//        fprintf(stderr, " (%s:%d)", file, line);
+//    }
+//    fprintf(stderr, ": %s: %s\n", msg, cudaGetErrorString(err));
+//    exit(EXIT_FAILURE);
+//}
+
+
+namespace StreamCompaction {
+namespace Common {
+
+/**
+ * Builds the keep/discard flags used by stream compaction: bools[i] becomes 1
+ * when idata[i].run is set and 0 otherwise, for every i below n.
+ * Elements flagged 0 are dropped by the scatter step; elements flagged 1 are kept.
+ * One thread per element; threads at or beyond n do nothing.
+ */
+__global__ void kernMapToBoolean(int n, int *bools, Ray *idata) {
+	const int i = threadIdx.x + (blockIdx.x * blockDim.x);
+	if( i >= n ) {
+		return;
+	}
+
+	// run == true -> keep (1), run == false -> drop (0)
+	bools[i] = idata[i].run ? 1 : 0;
+}
+
+/**
+ * Scatter step of stream compaction: every element i below n whose flag
+ * bools[i] equals 1 is copied from idata[i] to odata[indices[i]].
+ */
+__global__ void kernScatter(int n, Ray *odata,
+        Ray *idata, const int *bools, const int *indices) {
+	const int i = threadIdx.x + (blockIdx.x * blockDim.x);
+
+	// Out-of-range threads and discarded elements write nothing.
+	if( i >= n || bools[i] != 1 ) {
+		return;
+	}
+	odata[indices[i]] = idata[i];
+}
+
+}
+}
diff --git a/stream_compaction/common.h b/stream_compaction/common.h
new file mode 100644
index 0000000..feb6baf
--- /dev/null
+++ b/stream_compaction/common.h
@@ -0,0 +1,35 @@
+#pragma once
+
+#include <cstdio>
+#include <cstring>
+#include <cmath>
+#include "../src/sceneStructs.h"
+
+//#define FILENAME (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__)
+//#define checkCUDAError(msg) checkCUDAErrorFn(msg, FILENAME, __LINE__)
+
+/**
+ * Check for CUDA errors; print and exit if there was a problem.
+ */
+void checkCUDAErrorFn(const char *msg, const char *file = NULL, int line = -1);
+
+/** Returns floor(log2(x)) for x >= 1 and 0 for x <= 0; looping on x > 1 means a negative x cannot spin forever under an arithmetic shift. */
+inline int ilog2(int x) {
+    int lg = 0;
+    for (; x > 1; x >>= 1) ++lg;
+    return lg;
+}
+
+/** Returns ceil(log2(x)) for x >= 1 and 0 for x <= 1, so ilog2ceil(1) == 0 and ilog2 is never handed a negative value. */
+inline int ilog2ceil(int x) {
+    return (x <= 1) ? 0 : ilog2(x - 1) + 1;
+}
+
+
+namespace StreamCompaction {
+namespace Common {
+    __global__ void kernMapToBoolean(int n, int *bools, Ray *idata);  // bools[i] = 1 when idata[i].run is set, otherwise 0, for every i < n
+    // Copies idata[i] to odata[indices[i]] for every i < n whose bools[i] == 1.
+    __global__ void kernScatter(int n, Ray *odata, Ray *idata, const int *bools, const int *indices);
+}
+}
diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu
new file mode 100644
index 0000000..50fcad0
--- /dev/null
+++ b/stream_compaction/efficient.cu
@@ -0,0 +1,151 @@
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include "common.h"
+#include "efficient.h"
+
+#define blockSize 256
+#define blockSizeHalf 128
+
+namespace StreamCompaction {
+namespace Efficient {
+
+// Exclusive scan of one chunk of n ints per thread block, done in shared memory (code from http://http.developer.nvidia.com/GPUGems3/gpugems3_ch39.html).
+__global__ void prescan(int n, int *odata, const int *idata) {
+	extern __shared__ int temp[];  // dynamic shared memory: indexed up to n - 1 and 2 * tid + 1, so the launch must supply that many ints
+	int tid = threadIdx.x;
+	int index = blockIdx.x * blockDim.x + threadIdx.x;
+	int offset = 1;
+	// Each thread stages two consecutive elements. No bounds check: idata and odata must hold 2 * blockDim.x * gridDim.x ints. NOTE(review): presumably n must be a power of two equal to 2 * blockDim.x, as in scan()'s launch - confirm before reusing.
+	temp[2 * tid] = idata[2 * index ];
+	temp[2 * tid + 1] = idata[2 * index + 1 ];
+	// Up-sweep: accumulate partial sums in place, doubling the stride (offset) at every level.
+	for (int d = n >> 1; d > 0; d >>= 1) {
+		__syncthreads();
+		if (tid < d) {
+			int ai = offset * (2 * tid + 1) - 1;
+			int bi = offset * (2 * tid + 2) - 1;
+			temp[bi] += temp[ai];
+		}
+		offset *= 2;
+	}
+	// Zeroing the last element (which holds the chunk total after the up-sweep) makes the result an exclusive scan.
+	if (tid == 0) {
+		temp[n - 1] = 0;
+	}
+	// Down-sweep: halve the stride at every level, swapping and accumulating to turn the partial sums into prefix sums.
+	for (int d = 1; d < n; d *= 2) {
+		offset >>= 1;
+		__syncthreads();
+		if (tid < d) {
+			int ai = offset * (2 * tid + 1) - 1;
+			int bi = offset * (2 * tid + 2) - 1;
+			int t = temp[ai];
+			temp[ai] = temp[bi];
+			temp[bi] += t;
+		}
+	}
+	__syncthreads();
+	// Write this block's scanned chunk back to global memory, two elements per thread.
+	odata[2 * index ] = temp[2 * tid];
+	odata[2 * index + 1 ] = temp[2 * tid + 1];
+}
+
+// One thread per scan chunk: datasum[c] = idata[last] + odata[last], where last is the final slot of chunk c (chunks are blockSize wide).
+__global__ void sumEachBlock(int n, int *datasum, int *idata, int *odata) {
+	const int chunk = (blockIdx.x * blockDim.x) + threadIdx.x;
+	if( chunk >= n ) return;
+	datasum[chunk] = idata[(chunk + 1) * blockSize - 1] + odata[(chunk + 1) * blockSize - 1];
+}
+
+// Adds increments[blockIdx.x] to every element handled by that thread block, for elements below n.
+__global__ void addIncrements(int n, int *data, int *increments) {
+	const int i = (blockIdx.x * blockDim.x) + threadIdx.x;
+	if( i >= n ) return;
+	data[i] += increments[blockIdx.x];
+}
+
+// Exclusive prefix sum of the n device ints in idata, stored in the n device ints of odata.
+// Multi-block algorithm from the CIS565 lecture slides: scan each chunk, scan the chunk totals, add them back.
+void scan(int n, int *odata, const int *idata) {
+	if( n <= 0 ) return;  // nothing to scan, and a launch with zero blocks would be an error
+	int blocksPerGrid = (n + blockSize - 1) / blockSize;
+	int n_new = blocksPerGrid * blockSize;  // n rounded up to whole blockSize chunks
+	int *dev_idata;  // zero-padded copy of idata
+	int *dev_odata;  // padded result: the kernels write all n_new slots, which odata (n ints) may not have
+	cudaMalloc((void**)&dev_idata, n_new * sizeof(int));
+	cudaMalloc((void**)&dev_odata, n_new * sizeof(int));
+	cudaMemset(dev_idata, 0, n_new * sizeof(int));
+	cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyDeviceToDevice);
+
+	// blockSizeHalf threads scan one blockSize chunk in shared memory, two elements per thread
+	prescan<<<blocksPerGrid, blockSizeHalf, blockSize * sizeof(int)>>>(blockSize, dev_odata, dev_idata);
+	if( blocksPerGrid > 1) {
+		int *dev_sum, *dev_sum_scan;
+		cudaMalloc((void**)&dev_sum, blocksPerGrid * sizeof(int));
+		cudaMalloc((void**)&dev_sum_scan, blocksPerGrid * sizeof(int));
+		int blocksPerGrid_new = (blocksPerGrid + blockSize - 1) / blockSize;
+		sumEachBlock<<<blocksPerGrid_new, blockSize>>>(blocksPerGrid, dev_sum, dev_odata, dev_idata);  // total of every chunk
+		scan(blocksPerGrid, dev_sum_scan, dev_sum);  // starting offset of every chunk
+		addIncrements<<<blocksPerGrid, blockSize>>>(n_new, dev_odata, dev_sum_scan);
+		cudaFree(dev_sum);
+		cudaFree(dev_sum_scan);
+	}
+	cudaMemcpy(odata, dev_odata, n * sizeof(int), cudaMemcpyDeviceToDevice);  // hand back only the n requested values
+	cudaFree(dev_idata);
+	cudaFree(dev_odata);
+}
+
+
+/**
+ * Performs stream compaction on idata, storing the result into odata.
+ * Rays whose `run` flag is false are discarded; kept rays retain their order.
+ *
+ * Only the last flag and the last scan index are copied back to the host, so
+ * no host-side array of n elements is needed.
+ *
+ * @param n      The number of elements in idata.
+ * @param odata  The device array into which to store the kept elements.
+ * @param idata  The device array of elements to compact.
+ * @returns      The number of elements remaining after compaction
+ *               (0 when n <= 0).
+ */
+int compact(int n, Ray *odata, Ray *idata) {
+	if( n <= 0 ) {
+		return 0;  // nothing to compact; also avoids zero-block launches and reading element n - 1
+	}
+
+	Ray *dev_idata = idata;
+	Ray *dev_odata = odata;
+	int *dev_bools;
+	int *dev_indices;
+
+	dim3 fullBlocksPerGrid((n + blockSize - 1) / blockSize);
+	// Round the scratch buffers up to whole blockSize chunks so that a
+	// chunk-granular scan can never write past the end of dev_indices.
+	const int padded = fullBlocksPerGrid.x * blockSize;
+
+	cudaMalloc((void**)&dev_bools, padded * sizeof(int));
+	cudaMalloc((void**)&dev_indices, padded * sizeof(int));
+	cudaMemset(dev_indices, 0, padded * sizeof(int));
+
+	// flag each ray -> exclusive scan of the flags -> scatter the kept rays
+	Common::kernMapToBoolean<<<fullBlocksPerGrid, blockSize>>>(n, dev_bools, dev_idata);
+	scan(n, dev_indices, dev_bools);
+	Common::kernScatter<<<fullBlocksPerGrid, blockSize>>>(n, dev_odata, dev_idata, dev_bools, dev_indices);
+
+	// The survivor count is the last exclusive-scan index, plus one when the
+	// last element itself was kept.
+	int lastBool = 0;
+	int lastIndex = 0;
+	cudaMemcpy(&lastBool, dev_bools + (n - 1), sizeof(int), cudaMemcpyDeviceToHost);
+	cudaMemcpy(&lastIndex, dev_indices + (n - 1), sizeof(int), cudaMemcpyDeviceToHost);
+
+	cudaFree(dev_bools);
+	cudaFree(dev_indices);
+
+	const int remaining = (lastBool == 0) ? lastIndex : lastIndex + 1;
+	return remaining;
+}
+
+}
+}
diff --git a/stream_compaction/efficient.h b/stream_compaction/efficient.h
new file mode 100644
index 0000000..5c36430
--- /dev/null
+++ b/stream_compaction/efficient.h
@@ -0,0 +1,9 @@
+#pragma once
+#include "../src/sceneStructs.h"
+
+namespace StreamCompaction {
+namespace Efficient {
+    void scan(int n, int *odata, const int *idata);  // exclusive prefix sum over n device ints; NOTE(review): the definition in efficient.cu must also take `const int *idata`, otherwise this overload has no definition
+    int compact(int n, Ray *odata, Ray *idata);  // copies the rays with run == true from idata to odata (device arrays) and returns how many were kept
+}
+}