From b81624929d69f602d3eda7fc8a7a5a29e1530e6b Mon Sep 17 00:00:00 2001 From: Connor Fitzgerald Date: Sun, 5 Jan 2025 02:40:44 -0500 Subject: [PATCH] Standalone Compute Fix Generate Fix? 17th Time's The Charm Fix Build See Eye Docs Victory Tabular Formatting Formating3 --- .deny.toml | 1 + .github/workflows/generate.yml | 60 +++++ Cargo.lock | 20 +- Cargo.toml | 7 +- README.md | 13 +- examples/README.md | 91 ++----- examples/features/src/hello_compute/README.md | 22 -- examples/features/src/hello_compute/mod.rs | 193 ------------- .../features/src/hello_compute/shader.wgsl | 38 --- examples/features/src/hello_compute/tests.rs | 106 -------- examples/features/src/lib.rs | 2 - examples/features/src/main.rs | 12 - .../standalone/1_hello_compute/Cargo.toml | 11 + .../1_hello_compute/cargo-generate.toml | 1 + .../standalone/1_hello_compute/src/main.rs | 255 ++++++++++++++++++ .../1_hello_compute/src/shader.wgsl | 27 ++ 16 files changed, 404 insertions(+), 455 deletions(-) create mode 100644 .github/workflows/generate.yml delete mode 100644 examples/features/src/hello_compute/README.md delete mode 100644 examples/features/src/hello_compute/mod.rs delete mode 100644 examples/features/src/hello_compute/shader.wgsl delete mode 100644 examples/features/src/hello_compute/tests.rs create mode 100644 examples/standalone/1_hello_compute/Cargo.toml create mode 100644 examples/standalone/1_hello_compute/cargo-generate.toml create mode 100644 examples/standalone/1_hello_compute/src/main.rs create mode 100644 examples/standalone/1_hello_compute/src/shader.wgsl diff --git a/.deny.toml b/.deny.toml index 1976f709c9..f2777f35c4 100644 --- a/.deny.toml +++ b/.deny.toml @@ -36,6 +36,7 @@ allow = [ "Unicode-3.0", "Zlib", ] +private = { ignore = true } [sources] allow-git = [ diff --git a/.github/workflows/generate.yml b/.github/workflows/generate.yml new file mode 100644 index 0000000000..495f39bc5d --- /dev/null +++ b/.github/workflows/generate.yml @@ -0,0 +1,60 @@ +name: cargo-generate + 
+on: + push: + branches: ["*"] + tags: [v0.*] + pull_request: + merge_group: + +env: + # + # Dependency versioning + # + + # This is the MSRV used by `wgpu` itself and all surrounding infrastructure. + REPO_MSRV: "1.83" + RUSTFLAGS: -D warnings + +jobs: + cargo-generate: + timeout-minutes: 5 + + runs-on: ubuntu-latest + + strategy: + fail-fast: false + matrix: + include: + - name: "hello-compute" + path: "examples/standalone/1_hello_compute" + + name: "${{ matrix.name }}" + + steps: + - uses: actions/checkout@v2 + + - name: Install Repo MSRV toolchain + run: | + rustup toolchain install ${{ env.REPO_MSRV }} --no-self-update --profile=minimal + rustup override set ${{ env.REPO_MSRV }} + cargo -V + + - name: "Install cargo-generate" + uses: taiki-e/install-action@v2 + with: + tool: cargo-generate + + - name: "Run cargo-generate" + run: | + cd .. + cargo generate --path wgpu --name ${{ matrix.name }} ${{ matrix.path }} + + - name: "Check generated files" + run: | + cd ../${{ matrix.name }}/ + cat <<EOF >> Cargo.toml + [patch.crates-io] + wgpu = { path = "../wgpu/wgpu" } + EOF + cargo check diff --git a/Cargo.lock b/Cargo.lock index 5e79f1c44d..abe8e79cfd 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1152,7 +1152,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "33d852cb9b869c2a9b3df2f71a3074817f01e1844f839a144f5fcef059a4eb5d" dependencies = [ "libc", - "windows-sys 0.52.0", + "windows-sys 0.59.0", ] [[package]] @@ -1974,7 +1974,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fc2f4eb4bc735547cfed7c0a4922cbd04a4655978c09b54f1f7b228750664c34" dependencies = [ "cfg-if", - "windows-targets 0.48.5", + "windows-targets 0.52.6", ] [[package]] @@ -2942,7 +2942,7 @@ dependencies = [ "errno", "libc", "linux-raw-sys", - "windows-sys 0.52.0", + "windows-sys 0.59.0", ] [[package]] @@ -3566,7 +3566,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"69fff37da548239c3bf9e64a12193d261e8b22b660991c6fd2df057c168f435f" dependencies = [ "cc", - "windows-targets 0.48.5", + "windows-targets 0.52.6", ] [[package]] @@ -4149,6 +4149,16 @@ dependencies = [ "winit", ] +[[package]] +name = "wgpu-hello-compute" +version = "0.0.0" +dependencies = [ + "bytemuck", + "env_logger", + "pollster", + "wgpu", +] + [[package]] name = "wgpu-info" version = "24.0.0" @@ -4267,7 +4277,7 @@ version = "0.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cf221c93e13a30d793f7645a0e7762c55d169dbb0a49671918a2319d289b10bb" dependencies = [ - "windows-sys 0.48.0", + "windows-sys 0.59.0", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index ff504818d8..db54a01de0 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -7,7 +7,7 @@ members = [ # default members "benches", "examples/features", - # "examples/standalone/*", + "examples/standalone/*", "lock-analyzer", "naga-cli", "naga", @@ -26,7 +26,7 @@ exclude = [] default-members = [ "benches", "examples/features", - # "examples/standalone/*", + "examples/standalone/*", "lock-analyzer", "naga-cli", "naga", @@ -219,6 +219,9 @@ ndk-sys = "0.5.0" #js-sys = { path = "../wasm-bindgen/crates/js-sys" } #wasm-bindgen = { path = "../wasm-bindgen" } +# These overrides allow our examples to explicitly depend on release crates +wgpu = { path = "./wgpu" } + [profile.release] lto = "thin" debug = true diff --git a/README.md b/README.md index 55b1a415b2..2a4ce0593b 100644 --- a/README.md +++ b/README.md @@ -12,6 +12,13 @@ The API is based on the [WebGPU standard](https://gpuweb.github.io/gpuweb/). It serves as the core of the WebGPU integration in Firefox, Servo, and Deno. 
+## Quick Links + +| Docs | Examples | Changelog | +|:------------------------------------:|:--------------------------------------------------------------------:|:----------------------------------------------------------------------------:| +| [v24](https://docs.rs/wgpu/) | [v24](https://github.com/gfx-rs/wgpu/tree/v24/examples#readme) | [v24](https://github.com/gfx-rs/wgpu/releases) | +| [`trunk`](https://wgpu.rs/doc/wgpu/) | [`trunk`](https://github.com/gfx-rs/wgpu/tree/trunk/examples#readme) | [`trunk`](https://github.com/gfx-rs/wgpu/blob/trunk/CHANGELOG.md#unreleased) | + ## Repo Overview The repository hosts the following libraries: @@ -42,14 +49,12 @@ Go to [https://wgpu.rs/examples/] to play with our examples in your browser. Req Rust examples can be found at [wgpu/examples](examples). You can run the examples on native with `cargo run --bin wgpu-examples `. See the [list of examples](examples). +If you are new to wgpu and graphics programming, we recommend starting with https://sotrh.github.io/learn-wgpu/. + To run the examples in a browser, run `cargo xtask run-wasm`. Then open `http://localhost:8000` in your browser, and you can choose an example to run. Naturally, in order to display any of the WebGPU based examples, you need to make sure your browser supports it. -If you are looking for a wgpu tutorial, look at the following: - -- https://sotrh.github.io/learn-wgpu/ - ### C/C++ To use wgpu in C/C++, you need [wgpu-native](https://github.com/gfx-rs/wgpu-native). diff --git a/examples/README.md b/examples/README.md index 799c8a8d5b..d00191c847 100644 --- a/examples/README.md +++ b/examples/README.md @@ -1,20 +1,30 @@ -## Structure +> [!NOTE] +> These are the examples for the development version of wgpu. If you want to see the examples for the latest crates.io release +> of wgpu, go to the [latest release branch](https://github.com/gfx-rs/wgpu/tree/v24/examples#readme). 
-For the simplest examples without using any helping code (see `framework.rs` here), check out: +# Examples -- `hello` for printing adapter information -- `hello_triangle` for graphics and presentation -- `hello_compute` for pure computing +If you are just starting your graphics programming journey entirely, we recommend going through [Learn-WGPU](https://sotrh.github.io/learn-wgpu/) +for a more guided tutorial, which will also teach you the basics of graphics programming. -### Summary of examples +## Standalone Examples -A summary of the basic examples as split along the graphics and compute "pathways" laid out roughly in order of building on each other. Those further indented, and thus more roughly dependent on more other examples, tend to be more complicated as well as those further down. It should be noted, though, that computing examples, even though they are mentioned further down (because rendering to a window is by far the most common use case), tend to be less complex as they require less surrounding context to create and manage a window to render to. +All the standalone examples are separate crates and include all boilerplate inside the example itself. They can +be cloned out of the repository to serve as a starting point for your own projects and are fully commented. -The rest of the examples are for demonstrating specific features that you can come back for later when you know what those features are. +| Name | Description | Platforms | +|--------|-------------|-----------| +| [hello compute](standalone/1_hello_compute/) | Simplest example and shows how to run a compute shader on a given set of input data and get the results back. | Native-Only | -#### General +You can also use [`cargo-generate`](https://github.com/cargo-generate/cargo-generate) to easily use these as a basis for your own projects. 
-- `hello` - Demonstrates the basics of the WGPU library by getting a default Adapter and debugging it to the screen +```sh +cargo generate gfx-rs/wgpu --branch v24 +``` + +## Framework Examples + +These examples use a common framework to handle wgpu init, window creation, and event handling. This allows the example to focus on the unique code in the example itself. Refer to the standalone examples for a more detailed look at the boilerplate code. #### Graphics @@ -44,69 +54,8 @@ The rest of the examples are for demonstrating specific features that you can co - `ray_cube_compute` - Demonstrates using ray queries with a compute shader. - `ray_traced_triangle` - A simpler example demonstrating using ray queries with a compute shader -## Feature matrix - -| Feature | boids | bunnymark | conservative_raster | cube | hello_synchronization | hello_workgroups | mipmap | msaa_line | render_to_texture | repeated_compute | shadow | skybox | stencil_triangles | storage_texture | texture_arrays | uniform_values | water | ray_cube_compute | ray_cube_fragment | ray_scene | ray_shadows | ray_traced_triangle | -|------------------------------| ------ | --------- | ------------------- | ------ | --------------------- | ---------------- | ------ | --------- | ----------------- | ---------------- | ------ | ------ | ----------------- | --------------- | -------------- | -------------- | ------ |------------------|-------------------|-----------|-------------|---------------------| -| vertex attributes | :star: | | | :star: | | | | :star: | | | :star: | :star: | | | :star: | | :star: | | | | | | -| instancing | :star: | | | | | | | | | | | | | | | | | | | | | | -| lines and points | | | :star: | | | | | :star: | | | | | | | | | | | | | | | -| dynamic buffer offsets | | :star: | | | | | | | | | :star: | | | | | | | | | | | | -| implicit layout | | | | | | | :star: | | | | | | | | | | | | | | | | -| sampled color textures | :star: | :star: | :star: | :star: | | | :star: | | | | | :star: 
| | | :star: | | :star: | | | | | | -| storage textures | :star: | | | | | | | | | | | | | :star: | | | | :star: | | | | :star: | -| comparison samplers | | | | | | | | | | | :star: | | | | | | | | | | | | -| subresource views | | | | | | | :star: | | | | :star: | | | | | | | | | | | | -| cubemaps | | | | | | | | | | | | :star: | | | | | | | | | | | -| multisampling | | | | | | | | :star: | | | | | | | | | | | | | | | -| off-screen rendering | | | :star: | | | | | | :star: | | :star: | | | | | | :star: | | | | | | -| stencil testing | | | | | | | | | | | | | :star: | | | | | | | | | | -| depth testing | | | | | | | | | | | :star: | :star: | | | | | :star: | | | | | | -| depth biasing | | | | | | | | | | | :star: | | | | | | | | | | | | -| read-only depth | | | | | | | | | | | | | | | | | :star: | | | | | | -| blending | | :star: | | :star: | | | | | | | | | | | | | :star: | | | | | | -| render bundles | | | | | | | | :star: | | | | | | | | | :star: | | | | | | -| uniform buffers | | | | | | | | | | | | | | | | :star: | | | | | | | -| compute passes | :star: | | | | :star: | :star: | | | | :star: | | | | :star: | | | | | | | | | -| buffer mapping | | | | | :star: | :star: | | | | :star: | | | | :star: | | | | | | | | | -| error scopes | | | | :star: | | | | | | | | | | | | | | | | | | | -| compute workgroups | | | | | :star: | :star: | | | | | | | | | | | | | | | | | -| compute synchronization | | | | | :star: | | | | | | | | | | | | | | | | | | -| _optional extensions_ | | | | | | | | | | | | | | | :star: | | | | | | | | -| - SPIR-V shaders | | | | | | | | | | | | | | | | | | | | | | | -| - binding array | | | | | | | | | | | | | | | :star: | | | | | | | | -| - push constants | | | | | | | | | | | | | | | | | | | | | :star: | | -| - depth clamping | | | | | | | | | | | :star: | | | | | | | | | | | | -| - compressed textures | | | | | | | | | | | | :star: | | | | | | | | | | | -| - polygon mode | | | | :star: | | | | | | | | | | | | | | | | | | | -| - queries | | | 
| | | | :star: | | | | | | | | | | | | | | | | -| - conservative rasterization | | | :star: | | | | | | | | | | | | | | | | | | | | -| - ray queries | | | | | | | | | | | | | | | | | | :star: | :star: | :star: | :star: | :star: | -| _integrations_ | | | | | | | | | | | | | | | | | | | | | | | -| - staging belt | | | | | | | | | | | | :star: | | | | | | | | | | | -| - typed arena | | | | | | | | | | | | | | | | | | | | | | | -| - obj loading | | | | | | | | | | | | :star: | | | | | | | | :star: | | | - ## Running on the Web To run the examples in a browser, run `cargo xtask run-wasm`. Then open `http://localhost:8000` in your browser, and you can choose an example to run. Naturally, in order to display any of the WebGPU based examples, you need to make sure your browser supports it. - -Note that many cannot be downleveled to WebGL as WebGL does (among other things) not support storage texture, storage buffers and compute shaders. Running any example using these feature in a browser will require that browser to support WebGPU. - -## Additional notes - -Note that the examples regarding computing build off of each other; repeated_compute extends hello_compute, hello_workgroups assumes you know the basic workflow of GPU computation, and hello_synchronization assumes you know what a workgroup is. - -All the examples use [WGSL](https://gpuweb.github.io/gpuweb/wgsl.html) shaders unless specified otherwise. - -All framework-based examples render to the window and are reftested against the screenshot in the directory. 
- -## Hacking - -You can record an API trace for any of the framework-based examples by starting them as: - -```sh -mkdir -p trace && WGPU_TRACE=trace cargo run --features trace --bin wgpu-examples -``` diff --git a/examples/features/src/hello_compute/README.md b/examples/features/src/hello_compute/README.md deleted file mode 100644 index 8b3f3e111d..0000000000 --- a/examples/features/src/hello_compute/README.md +++ /dev/null @@ -1,22 +0,0 @@ -# hello-compute - -Runs a compute shader to determine the number of iterations of the rules from -Collatz Conjecture - -- If n is even, n = n/2 -- If n is odd, n = 3n+1 - -that it will take to finish and reach the number `1`. - -## To Run - -``` -# Pass in any 4 numbers as arguments -RUST_LOG=hello_compute cargo run --bin wgpu-examples hello_compute 1 4 3 295 -``` - -## Example Output - -``` -[2020-04-25T11:15:33Z INFO hello_compute] Steps: [0, 2, 7, 55] -``` diff --git a/examples/features/src/hello_compute/mod.rs b/examples/features/src/hello_compute/mod.rs deleted file mode 100644 index 0f38a1c4b8..0000000000 --- a/examples/features/src/hello_compute/mod.rs +++ /dev/null @@ -1,193 +0,0 @@ -use std::str::FromStr; -use wgpu::util::DeviceExt; - -// Indicates a u32 overflow in an intermediate Collatz value -const OVERFLOW: u32 = 0xffffffff; - -async fn run() { - let numbers = if std::env::args().len() <= 2 { - let default = vec![1, 2, 3, 4]; - println!("No numbers were provided, defaulting to {default:?}"); - default - } else { - std::env::args() - .skip(2) - .map(|s| u32::from_str(&s).expect("You must pass a list of positive integers!")) - .collect() - }; - - let steps = execute_gpu(&numbers).await.unwrap(); - - let disp_steps: Vec = steps - .iter() - .map(|&n| match n { - OVERFLOW => "OVERFLOW".to_string(), - _ => n.to_string(), - }) - .collect(); - - println!("Steps: [{}]", disp_steps.join(", ")); - #[cfg(target_arch = "wasm32")] - log::info!("Steps: [{}]", disp_steps.join(", ")); -} - -async fn execute_gpu(numbers: &[u32]) 
-> Option> { - // Instantiates instance of WebGPU - let instance = wgpu::Instance::default(); - - // `request_adapter` instantiates the general connection to the GPU - let adapter = instance - .request_adapter(&wgpu::RequestAdapterOptions::default()) - .await?; - - // `request_device` instantiates the feature specific connection to the GPU, defining some parameters, - // `features` being the available features. - let (device, queue) = adapter - .request_device( - &wgpu::DeviceDescriptor { - label: None, - required_features: wgpu::Features::empty(), - required_limits: wgpu::Limits::downlevel_defaults(), - memory_hints: wgpu::MemoryHints::MemoryUsage, - }, - None, - ) - .await - .unwrap(); - - execute_gpu_inner(&device, &queue, numbers).await -} - -async fn execute_gpu_inner( - device: &wgpu::Device, - queue: &wgpu::Queue, - numbers: &[u32], -) -> Option> { - // Loads the shader from WGSL - let cs_module = device.create_shader_module(wgpu::include_wgsl!("shader.wgsl")); - - // Gets the size in bytes of the buffer. - let size = size_of_val(numbers) as wgpu::BufferAddress; - - // Instantiates buffer without data. - // `usage` of buffer specifies how it can be used: - // `BufferUsages::MAP_READ` allows it to be read (outside the shader). - // `BufferUsages::COPY_DST` allows it to be the destination of the copy. - let staging_buffer = device.create_buffer(&wgpu::BufferDescriptor { - label: None, - size, - usage: wgpu::BufferUsages::MAP_READ | wgpu::BufferUsages::COPY_DST, - mapped_at_creation: false, - }); - - // Instantiates buffer with data (`numbers`). - // Usage allowing the buffer to be: - // A storage buffer (can be bound within a bind group and thus available to a shader). - // The destination of a copy. - // The source of a copy. 
- let storage_buffer = device.create_buffer_init(&wgpu::util::BufferInitDescriptor { - label: Some("Storage Buffer"), - contents: bytemuck::cast_slice(numbers), - usage: wgpu::BufferUsages::STORAGE - | wgpu::BufferUsages::COPY_DST - | wgpu::BufferUsages::COPY_SRC, - }); - - // A bind group defines how buffers are accessed by shaders. - // It is to WebGPU what a descriptor set is to Vulkan. - // `binding` here refers to the `binding` of a buffer in the shader (`layout(set = 0, binding = 0) buffer`). - - // A pipeline specifies the operation of a shader - - // Instantiates the pipeline. - let compute_pipeline = device.create_compute_pipeline(&wgpu::ComputePipelineDescriptor { - label: None, - layout: None, - module: &cs_module, - entry_point: Some("main"), - compilation_options: Default::default(), - cache: None, - }); - - // Instantiates the bind group, once again specifying the binding of buffers. - let bind_group_layout = compute_pipeline.get_bind_group_layout(0); - let bind_group = device.create_bind_group(&wgpu::BindGroupDescriptor { - label: None, - layout: &bind_group_layout, - entries: &[wgpu::BindGroupEntry { - binding: 0, - resource: storage_buffer.as_entire_binding(), - }], - }); - - // A command encoder executes one or many pipelines. - // It is to WebGPU what a command buffer is to Vulkan. - let mut encoder = - device.create_command_encoder(&wgpu::CommandEncoderDescriptor { label: None }); - { - let mut cpass = encoder.begin_compute_pass(&wgpu::ComputePassDescriptor { - label: None, - timestamp_writes: None, - }); - cpass.set_pipeline(&compute_pipeline); - cpass.set_bind_group(0, &bind_group, &[]); - cpass.insert_debug_marker("compute collatz iterations"); - cpass.dispatch_workgroups(numbers.len() as u32, 1, 1); // Number of cells to run, the (x,y,z) size of item being processed - } - // Sets adds copy operation to command encoder. - // Will copy data from storage buffer on GPU to staging buffer on CPU. 
- encoder.copy_buffer_to_buffer(&storage_buffer, 0, &staging_buffer, 0, size); - - // Submits command encoder for processing - queue.submit(Some(encoder.finish())); - - // Note that we're not calling `.await` here. - let buffer_slice = staging_buffer.slice(..); - // Sets the buffer up for mapping, sending over the result of the mapping back to us when it is finished. - let (sender, receiver) = flume::bounded(1); - buffer_slice.map_async(wgpu::MapMode::Read, move |v| sender.send(v).unwrap()); - - // Poll the device in a blocking manner so that our future resolves. - // In an actual application, `device.poll(...)` should - // be called in an event loop or on another thread. - device.poll(wgpu::Maintain::wait()).panic_on_timeout(); - - // Awaits until `buffer_future` can be read from - if let Ok(Ok(())) = receiver.recv_async().await { - // Gets contents of buffer - let data = buffer_slice.get_mapped_range(); - // Since contents are got in bytes, this converts these bytes back to u32 - let result = bytemuck::cast_slice(&data).to_vec(); - - // With the current interface, we have to make sure all mapped views are - // dropped before we unmap the buffer. 
- drop(data); - staging_buffer.unmap(); // Unmaps buffer from memory - // If you are familiar with C++ these 2 lines can be thought of similarly to: - // delete myPointer; - // myPointer = NULL; - // It effectively frees the memory - - // Returns data from buffer - Some(result) - } else { - panic!("failed to run compute on gpu!") - } -} - -pub fn main() { - #[cfg(not(target_arch = "wasm32"))] - { - env_logger::init(); - pollster::block_on(run()); - } - #[cfg(target_arch = "wasm32")] - { - std::panic::set_hook(Box::new(console_error_panic_hook::hook)); - console_log::init().expect("could not initialize logger"); - wasm_bindgen_futures::spawn_local(run()); - } -} - -#[cfg(test)] -mod tests; diff --git a/examples/features/src/hello_compute/shader.wgsl b/examples/features/src/hello_compute/shader.wgsl deleted file mode 100644 index 41af4363a2..0000000000 --- a/examples/features/src/hello_compute/shader.wgsl +++ /dev/null @@ -1,38 +0,0 @@ -@group(0) -@binding(0) -var v_indices: array; // this is used as both input and output for convenience - -// The Collatz Conjecture states that for any integer n: -// If n is even, n = n/2 -// If n is odd, n = 3n+1 -// And repeat this process for each new n, you will always eventually reach 1. -// Though the conjecture has not been proven, no counterexample has ever been found. -// This function returns how many times this recurrence needs to be applied to reach 1. -fn collatz_iterations(n_base: u32) -> u32{ - var n: u32 = n_base; - var i: u32 = 0u; - loop { - if (n <= 1u) { - break; - } - if (n % 2u == 0u) { - n = n / 2u; - } - else { - // Overflow? (i.e. 3*n + 1 > 0xffffffffu?) 
- if (n >= 1431655765u) { // 0x55555555u - return 4294967295u; // 0xffffffffu - } - - n = 3u * n + 1u; - } - i = i + 1u; - } - return i; -} - -@compute -@workgroup_size(1) -fn main(@builtin(global_invocation_id) global_id: vec3) { - v_indices[global_id.x] = collatz_iterations(v_indices[global_id.x]); -} diff --git a/examples/features/src/hello_compute/tests.rs b/examples/features/src/hello_compute/tests.rs deleted file mode 100644 index f4554d7de5..0000000000 --- a/examples/features/src/hello_compute/tests.rs +++ /dev/null @@ -1,106 +0,0 @@ -use super::*; -use wgpu_test::{gpu_test, FailureCase, GpuTestConfiguration, TestParameters}; - -#[gpu_test] -static COMPUTE_1: GpuTestConfiguration = GpuTestConfiguration::new() - .parameters( - TestParameters::default() - .downlevel_flags(wgpu::DownlevelFlags::COMPUTE_SHADERS) - .limits(wgpu::Limits::downlevel_defaults()) - .skip(FailureCase::adapter("V3D")), - ) - .run_async(|ctx| { - let input = &[1, 2, 3, 4]; - - async move { assert_execute_gpu(&ctx.device, &ctx.queue, input, &[0, 1, 7, 2]).await } - }); - -#[gpu_test] -static COMPUTE_2: GpuTestConfiguration = GpuTestConfiguration::new() - .parameters( - TestParameters::default() - .downlevel_flags(wgpu::DownlevelFlags::COMPUTE_SHADERS) - .limits(wgpu::Limits::downlevel_defaults()) - .skip(FailureCase::adapter("V3D")), - ) - .run_async(|ctx| { - let input = &[5, 23, 10, 9]; - - async move { assert_execute_gpu(&ctx.device, &ctx.queue, input, &[5, 15, 6, 19]).await } - }); - -#[gpu_test] -static COMPUTE_OVERFLOW: GpuTestConfiguration = GpuTestConfiguration::new() - .parameters( - TestParameters::default() - .downlevel_flags(wgpu::DownlevelFlags::COMPUTE_SHADERS) - .limits(wgpu::Limits::downlevel_defaults()) - .skip(FailureCase::adapter("V3D")), - ) - .run_async(|ctx| { - let input = &[77031, 837799, 8400511, 63728127]; - async move { - assert_execute_gpu( - &ctx.device, - &ctx.queue, - input, - &[350, 524, OVERFLOW, OVERFLOW], - ) - .await - } - }); - -#[cfg(not(target_arch = 
"wasm32"))] -#[gpu_test] -static MULTITHREADED_COMPUTE: GpuTestConfiguration = GpuTestConfiguration::new() - .parameters( - TestParameters::default() - .downlevel_flags(wgpu::DownlevelFlags::COMPUTE_SHADERS) - .limits(wgpu::Limits::downlevel_defaults()) - .skip(FailureCase::adapter("V3D")), - ) - .run_sync(|ctx| { - use std::{sync::mpsc, sync::Arc, thread, time::Duration}; - - let ctx = Arc::new(ctx); - - let thread_count = 8; - - let (tx, rx) = mpsc::channel(); - let workers: Vec<_> = (0..thread_count) - .map(move |_| { - let tx = tx.clone(); - let ctx = Arc::clone(&ctx); - thread::spawn(move || { - let input = &[100, 100, 100]; - pollster::block_on(assert_execute_gpu( - &ctx.device, - &ctx.queue, - input, - &[25, 25, 25], - )); - tx.send(true).unwrap(); - }) - }) - .collect(); - - for _ in 0..thread_count { - rx.recv_timeout(Duration::from_secs(10)) - .expect("A thread never completed."); - } - - for worker in workers { - worker.join().unwrap(); - } - }); - -async fn assert_execute_gpu( - device: &wgpu::Device, - queue: &wgpu::Queue, - input: &[u32], - expected: &[u32], -) { - if let Some(produced) = execute_gpu_inner(device, queue, input).await { - assert_eq!(produced, expected); - } -} diff --git a/examples/features/src/lib.rs b/examples/features/src/lib.rs index ccef69199f..6efca36688 100644 --- a/examples/features/src/lib.rs +++ b/examples/features/src/lib.rs @@ -8,8 +8,6 @@ pub mod boids; pub mod bunnymark; pub mod conservative_raster; pub mod cube; -pub mod hello; -pub mod hello_compute; pub mod hello_synchronization; pub mod hello_triangle; pub mod hello_windows; diff --git a/examples/features/src/main.rs b/examples/features/src/main.rs index 482da930d8..a5b4ad6732 100644 --- a/examples/features/src/main.rs +++ b/examples/features/src/main.rs @@ -32,18 +32,6 @@ const EXAMPLES: &[ExampleDesc] = &[ webgl: true, webgpu: true, }, - ExampleDesc { - name: "hello", - function: wgpu_examples::hello::main, - webgl: false, // No canvas for WebGL - webgpu: true, - }, 
- ExampleDesc { - name: "hello_compute", - function: wgpu_examples::hello_compute::main, - webgl: false, // No compute - webgpu: true, - }, ExampleDesc { name: "hello_synchronization", function: wgpu_examples::hello_synchronization::main, diff --git a/examples/standalone/1_hello_compute/Cargo.toml b/examples/standalone/1_hello_compute/Cargo.toml new file mode 100644 index 0000000000..0ec65de534 --- /dev/null +++ b/examples/standalone/1_hello_compute/Cargo.toml @@ -0,0 +1,11 @@ +[package] +name = "wgpu-hello-compute" +edition = "2021" +rust-version = "1.83" +publish = false + +[dependencies] +bytemuck = "1" +env_logger = "0.11.6" +pollster = "0.4" +wgpu = "24.0.0" diff --git a/examples/standalone/1_hello_compute/cargo-generate.toml b/examples/standalone/1_hello_compute/cargo-generate.toml new file mode 100644 index 0000000000..8b13789179 --- /dev/null +++ b/examples/standalone/1_hello_compute/cargo-generate.toml @@ -0,0 +1 @@ + diff --git a/examples/standalone/1_hello_compute/src/main.rs b/examples/standalone/1_hello_compute/src/main.rs new file mode 100644 index 0000000000..9decdef0df --- /dev/null +++ b/examples/standalone/1_hello_compute/src/main.rs @@ -0,0 +1,255 @@ +/// To serve as an introduction to the wgpu api, we will implement a simple +/// compute shader which takes a list of numbers on the CPU and doubles them on the GPU. +/// +/// While this isn't a very practical example, you will see all the major components +/// of using wgpu headlessly, including getting a device, running a shader, and transferring +/// data between the CPU and GPU. +/// +/// If you time the recording and execution of this example you will certainly see that +/// running on the gpu is slower than doing the same calculation on the cpu. This is because +/// floating point multiplication is a very simple operation so the transfer/submission overhead +/// is quite a lot higher than the actual computation. 
This is normal and shows that the GPU
+/// needs a lot higher work/transfer ratio to come out ahead.
+use std::{num::NonZeroU64, str::FromStr};
+use wgpu::util::DeviceExt;
+
+fn main() {
+    // Parse all arguments as floats. We need to skip argument 0, which is the name of the program.
+    let arguments: Vec<f32> = std::env::args()
+        .skip(1)
+        .map(|s| {
+            f32::from_str(&s).unwrap_or_else(|_| panic!("Cannot parse argument {s:?} as a float."))
+        })
+        .collect();
+
+    if arguments.is_empty() {
+        println!("No arguments provided. Please provide a list of numbers to double.");
+        return;
+    }
+
+    println!("Parsed {} arguments", arguments.len());
+
+    // wgpu uses `log` for all of our logging, so we initialize a logger with the `env_logger` crate.
+    //
+    // To change the log level, set the `RUST_LOG` environment variable. See the `env_logger`
+    // documentation for more information.
+    env_logger::init();
+
+    // We first initialize a wgpu `Instance`, which contains any "global" state wgpu needs.
+    //
+    // This is what loads the vulkan/dx12/metal/opengl libraries.
+    let instance = wgpu::Instance::new(&wgpu::InstanceDescriptor::default());
+
+    // We then create an `Adapter` which represents a physical gpu in the system. It allows
+    // us to query information about it and create a `Device` from it.
+    //
+    // This function is asynchronous in WebGPU, so request_adapter returns a future. On native/webgl
+    // the future resolves immediately, so we can block on it without harm.
+    let adapter =
+        pollster::block_on(instance.request_adapter(&wgpu::RequestAdapterOptions::default()))
+            .expect("Failed to create adapter");
+
+    // Print out some basic information about the adapter.
+    println!("Running on Adapter: {:#?}", adapter.get_info());
+
+    // Check to see if the adapter supports compute shaders. While WebGPU guarantees support for
+    // compute shaders, wgpu supports a wider range of devices through the use of "downlevel" devices.
+    let downlevel_capabilities = adapter.get_downlevel_capabilities();
+    if !downlevel_capabilities
+        .flags
+        .contains(wgpu::DownlevelFlags::COMPUTE_SHADERS)
+    {
+        panic!("Adapter does not support compute shaders");
+    }
+
+    // We then create a `Device` and a `Queue` from the `Adapter`.
+    //
+    // The `Device` is used to create and manage GPU resources.
+    // The `Queue` is a queue used to submit work for the GPU to process.
+    let (device, queue) = pollster::block_on(adapter.request_device(
+        &wgpu::DeviceDescriptor {
+            label: None,
+            required_features: wgpu::Features::empty(),
+            required_limits: wgpu::Limits::downlevel_defaults(),
+            memory_hints: wgpu::MemoryHints::MemoryUsage,
+        },
+        None,
+    ))
+    .expect("Failed to create device");
+
+    // Create a shader module from our shader code. This will parse and validate the shader.
+    //
+    // `include_wgsl` is a macro provided by wgpu like `include_str` which constructs a ShaderModuleDescriptor.
+    // If you want to load shaders differently, you can construct the ShaderModuleDescriptor manually.
+    let module = device.create_shader_module(wgpu::include_wgsl!("shader.wgsl"));
+
+    // Create a buffer with the data we want to process on the GPU.
+    //
+    // `create_buffer_init` is a utility provided by `wgpu::util::DeviceExt` which simplifies creating
+    // a buffer with some initial data.
+    //
+    // We use the `bytemuck` crate to cast the slice of f32 to a &[u8] to be uploaded to the GPU.
+    let input_data_buffer = device.create_buffer_init(&wgpu::util::BufferInitDescriptor {
+        label: None,
+        contents: bytemuck::cast_slice(&arguments),
+        usage: wgpu::BufferUsages::STORAGE,
+    });
+
+    // Now we create a buffer to store the output data.
+    let output_data_buffer = device.create_buffer(&wgpu::BufferDescriptor {
+        label: None,
+        size: input_data_buffer.size(),
+        usage: wgpu::BufferUsages::STORAGE | wgpu::BufferUsages::COPY_SRC,
+        mapped_at_creation: false,
+    });
+
+    // Finally we create a buffer which can be read by the CPU. This buffer is how we will read
+    // the data. We need to use a separate buffer because we need to have a usage of `MAP_READ`,
+    // and that usage can only be used with `COPY_DST`.
+    let download_buffer = device.create_buffer(&wgpu::BufferDescriptor {
+        label: None,
+        size: input_data_buffer.size(),
+        usage: wgpu::BufferUsages::COPY_DST | wgpu::BufferUsages::MAP_READ,
+        mapped_at_creation: false,
+    });
+
+    // A bind group layout describes the types of resources that a bind group can contain. Think
+    // of this like a C-style header declaration, ensuring both the pipeline and bind group agree
+    // on the types of resources.
+    let bind_group_layout = device.create_bind_group_layout(&wgpu::BindGroupLayoutDescriptor {
+        label: None,
+        entries: &[
+            // Input buffer
+            wgpu::BindGroupLayoutEntry {
+                binding: 0,
+                visibility: wgpu::ShaderStages::COMPUTE,
+                ty: wgpu::BindingType::Buffer {
+                    ty: wgpu::BufferBindingType::Storage { read_only: true },
+                    // This is the size of a single element in the buffer.
+                    min_binding_size: Some(NonZeroU64::new(4).unwrap()),
+                    has_dynamic_offset: false,
+                },
+                count: None,
+            },
+            // Output buffer
+            wgpu::BindGroupLayoutEntry {
+                binding: 1,
+                visibility: wgpu::ShaderStages::COMPUTE,
+                ty: wgpu::BindingType::Buffer {
+                    ty: wgpu::BufferBindingType::Storage { read_only: false },
+                    // This is the size of a single element in the buffer.
+                    min_binding_size: Some(NonZeroU64::new(4).unwrap()),
+                    has_dynamic_offset: false,
+                },
+                count: None,
+            },
+        ],
+    });
+
+    // The bind group contains the actual resources to bind to the pipeline.
+    //
+    // Even when the buffers are individually dropped, wgpu will keep the bind group and buffers
+    // alive until the bind group itself is dropped.
+    let bind_group = device.create_bind_group(&wgpu::BindGroupDescriptor {
+        label: None,
+        layout: &bind_group_layout,
+        entries: &[
+            wgpu::BindGroupEntry {
+                binding: 0,
+                resource: input_data_buffer.as_entire_binding(),
+            },
+            wgpu::BindGroupEntry {
+                binding: 1,
+                resource: output_data_buffer.as_entire_binding(),
+            },
+        ],
+    });
+
+    // The pipeline layout describes the bind groups that a pipeline expects
+    let pipeline_layout = device.create_pipeline_layout(&wgpu::PipelineLayoutDescriptor {
+        label: None,
+        bind_group_layouts: &[&bind_group_layout],
+        push_constant_ranges: &[],
+    });
+
+    // The pipeline is the ready-to-go program state for the GPU. It contains the shader modules,
+    // the interfaces (bind group layouts) and the shader entry point.
+    let pipeline = device.create_compute_pipeline(&wgpu::ComputePipelineDescriptor {
+        label: None,
+        layout: Some(&pipeline_layout),
+        module: &module,
+        entry_point: Some("doubleMe"),
+        compilation_options: wgpu::PipelineCompilationOptions::default(),
+        cache: None,
+    });
+
+    // The command encoder allows us to record commands that we will later submit to the GPU.
+    let mut encoder =
+        device.create_command_encoder(&wgpu::CommandEncoderDescriptor { label: None });
+
+    // A compute pass is a single series of compute operations. While we are recording a compute
+    // pass, we cannot record to the encoder.
+    let mut compute_pass = encoder.begin_compute_pass(&wgpu::ComputePassDescriptor {
+        label: None,
+        timestamp_writes: None,
+    });
+
+    // Set the pipeline that we want to use
+    compute_pass.set_pipeline(&pipeline);
+    // Set the bind group that we want to use
+    compute_pass.set_bind_group(0, &bind_group, &[]);
+
+    // Now we dispatch a series of workgroups. Each workgroup is a 3D grid of individual programs.
+    //
+    // We defined the workgroup size in the shader as 64x1x1. So in order to process all of our
+    // inputs, we ceiling divide the number of inputs by 64. If the user passes 32 inputs, we will
+    // dispatch 1 workgroup. If the user passes 65 inputs, we will dispatch 2 workgroups, etc.
+    let workgroup_count = arguments.len().div_ceil(64);
+    compute_pass.dispatch_workgroups(workgroup_count as u32, 1, 1);
+
+    // Now we drop the compute pass, giving us access to the encoder again.
+    drop(compute_pass);
+
+    // We add a copy operation to the encoder. This will copy the data from the output buffer on the
+    // GPU to the download buffer on the CPU.
+    encoder.copy_buffer_to_buffer(
+        &output_data_buffer,
+        0,
+        &download_buffer,
+        0,
+        output_data_buffer.size(),
+    );
+
+    // We finish the encoder, giving us a fully recorded command buffer.
+    let command_buffer = encoder.finish();
+
+    // At this point nothing has actually been executed on the gpu. We have recorded a series of
+    // commands that we want to execute, but they haven't been sent to the gpu yet.
+    //
+    // Submitting to the queue sends the command buffer to the gpu. The gpu will then execute the
+    // commands in the command buffer in order.
+    queue.submit([command_buffer]);
+
+    // We now map the download buffer so we can read it. Mapping tells wgpu that we want to read/write
+    // to the buffer directly by the CPU and it should not permit any more GPU operations on the buffer.
+    //
+    // Mapping requires that the GPU be finished using the buffer before it resolves, so mapping has a callback
+    // to tell you when the mapping is complete.
+    let buffer_slice = download_buffer.slice(..);
+    buffer_slice.map_async(wgpu::MapMode::Read, |_| {
+        // In this case we know exactly when the mapping will be finished,
+        // so we don't need to do anything in the callback.
+    });
+
+    // Wait for the GPU to finish working on the submitted work. This doesn't work on WebGPU, so we would need
+    // to rely on the callback to know when the buffer is mapped.
+    device.poll(wgpu::Maintain::Wait);
+
+    // We can now read the data from the buffer.
+    let data = buffer_slice.get_mapped_range();
+    // Convert the data back to a slice of f32.
+    let result: &[f32] = bytemuck::cast_slice(&data);
+
+    // Print out the result.
+    println!("Result: {:?}", result);
+}
diff --git a/examples/standalone/1_hello_compute/src/shader.wgsl b/examples/standalone/1_hello_compute/src/shader.wgsl
new file mode 100644
index 0000000000..9ae3a341cc
--- /dev/null
+++ b/examples/standalone/1_hello_compute/src/shader.wgsl
@@ -0,0 +1,27 @@
+// Input to the shader. The length of the array is determined by what buffer is bound.
+//
+// Out of bounds accesses
+@group(0) @binding(0)
+var<storage, read> input: array<f32>;
+// Output of the shader.
+@group(0) @binding(1)
+var<storage, read_write> output: array<f32>;
+
+// Ideal workgroup size depends on the hardware, the workload, and other factors. However, it should
+// _generally_ be a multiple of 64. Common sizes are 64x1x1, 256x1x1; or 8x8x1, 16x16x1 for 2D workloads.
+@compute @workgroup_size(64)
+fn doubleMe(@builtin(global_invocation_id) global_id: vec3<u32>) {
+    // While compute invocations are 3d, we're only using one dimension.
+    let index = global_id.x;
+
+    // Because we're using a workgroup size of 64, if the input size isn't a multiple of 64,
+    // we will have some "extra" invocations. This is fine, but we should tell them to stop
+    // to avoid out-of-bounds accesses.
+    let array_length = arrayLength(&input);
+    if (global_id.x >= array_length) {
+        return;
+    }
+
+    // Do the multiply by two and write to the output.
+    output[global_id.x] = input[global_id.x] * 2.0;
+}