
Missed optimization/perf oddity with allocations #128854

Open
peterwmwong opened this issue Aug 8, 2024 · 8 comments
Labels
A-rust-for-linux Relevant for the Rust-for-Linux project C-optimization Category: An issue highlighting optimization opportunities or PRs implementing such T-libs Relevant to the library team, which will review and decide on the PR/issue.

Comments

@peterwmwong

peterwmwong commented Aug 8, 2024

Consider the following minimized example:

pub fn test() {
    for _ in 0..128 {
        let _ = vec![0; 32];
    }
}

Expected generated output (Rust 1.81.0):

example::test::h15f9c44deb8efbb9:
        ret

Actual output (Rust Nightly):

example::test::h15f9c44deb8efbb9:
        mov     rax, qword ptr [rip + __rust_no_alloc_shim_is_unstable@GOTPCREL]
        movzx   ecx, byte ptr [rax]
        [... 126 more identical `movzx ecx, byte ptr [rax]` loads elided; 128 loads in total, one per loop iteration ...]
        movzx   eax, byte ptr [rax]
        ret

Godbolt: https://www.godbolt.org/z/x458Pv8P5

@rustbot rustbot added the needs-triage This issue may need triage. Remove it if it has been sufficiently triaged. label Aug 8, 2024
@workingjubilee
Member

I know this is a "minimal reduction" but is there an example where this impacts actual programs?

@Artikae

Artikae commented Aug 10, 2024

For reference, the actual cause, as far as I can tell, is this line in the alloc function.

// Make sure we don't accidentally allow omitting the allocator shim in
// stable code until it is actually stabilized.
core::ptr::read_volatile(&__rust_no_alloc_shim_is_unstable);

It, of course, can't be optimized out because it's volatile.

That line doesn't appear in alloc_zeroed, for some reason. (Maybe a bug?)
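To make the mechanism concrete, here is a minimal, self-contained sketch (not the real shim; `SHIM_FLAG` and `pretend_alloc` are illustrative names) of why such a read survives optimization: a plain read of a constant would be folded away, but `read_volatile` is a side effect the optimizer must preserve, so one load per call remains even when the allocation itself is elided.

```rust
// Illustrative stand-in for __rust_no_alloc_shim_is_unstable.
static SHIM_FLAG: u8 = 0;

fn pretend_alloc() -> u8 {
    // A volatile read cannot be removed or merged with other reads,
    // mirroring the check performed in the real alloc function.
    unsafe { core::ptr::read_volatile(&SHIM_FLAG) }
}

fn main() {
    let mut last = 0;
    for _ in 0..128 {
        // Each iteration keeps its own volatile read, which is why the
        // assembly above contains 128 movzx loads.
        last = pretend_alloc();
    }
    println!("{last}");
}
```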

@jieyouxu jieyouxu added T-compiler Relevant to the compiler team, which will review and decide on the PR/issue. C-optimization Category: An issue highlighting optimization opportunities or PRs implementing such labels Aug 13, 2024
@peterwmwong
Author

Just noticed a helpful rundown of the cause, the reasoning, and the potential perf win if fixed (@Kobzol's perf run) in this Zulip conversation

@saethlin
Member

Would you count this as fixed if we make the codegen match by regressing the good case? #130497

bors added a commit to rust-lang-ci/rust that referenced this issue Sep 18, 2024
…=<try>

read_volatile __rust_no_alloc_shim_is_unstable in alloc_zeroed

rust-lang#128854 (comment)

r? `@ghost`
bors added a commit to rust-lang-ci/rust that referenced this issue Sep 18, 2024
…=bjorn3

read_volatile __rust_no_alloc_shim_is_unstable in alloc_zeroed

It was pointed out in rust-lang#128854 (comment) that the magic volatile read was probably missing from `alloc_zeroed`. I can't find any mention of `alloc_zeroed` on rust-lang#86844, so it looks like this was just missed initially.
@peterwmwong
Author

Bold move @saethlin 😆. I've updated the bug description/repro.

Y'all do whatever you want with this; it's just an observation I made a while back while looking into Mojo's claims of being faster than Rust.

@saethlin saethlin added A-rust-for-linux Relevant for the Rust-for-Linux project T-libs Relevant to the library team, which will review and decide on the PR/issue. and removed needs-triage This issue may need triage. Remove it if it has been sufficiently triaged. T-compiler Relevant to the compiler team, which will review and decide on the PR/issue. labels Sep 21, 2024
@findepi

findepi commented Feb 11, 2025

I know this is a "minimal reduction" but is there an example where this impacts actual programs?

If this issue lacks a motivating example, let me add one. (If it already has one, sorry for the spam.)
(Edit: it turned out not to be a good example; see #128854 (comment) for an update.)

details

I am prototyping a design for DataFusion scalar function vectorization without boilerplate: apache/datafusion#12635
For certain lazy-evaluation needs I'd like to have functions that return impl Fn, which in turn could return impl Fn...
Returning an impl Fn that returns an impl Fn is of course impossible, but returning them as Box<dyn Fn> works, and the compiler nicely optimizes away all the dyn calls. However, the remnants of the Box allocations remain.

Benchmarking the code below shows that the second function is 6x slower than the first on my machine.

// Simple Result type. The Result wrappers get optimized away nicely
type Result<T> = std::result::Result<T, String>; 

fn simple_sum(a: i32, b: i32, c: i32, d: i32) -> Result<i32> {
    Ok(a + b + c + d)
}

fn curried_sum(a: i32, b: i32, c: i32, d: i32) -> Result<i32> {
    // the arithmetic gets inlined nicely
    Ok(fn_fn_fn_fn(a)?(b)?(c)?(d)?)
}

fn fn_fn_fn_fn(a: i32) -> Result<Box<dyn Fn(i32) -> Result<Box<dyn Fn(i32) -> Result<Box<dyn Fn(i32) -> Result<i32>>>>>>>
{
    Ok(Box::new(move |b| Ok(Box::new(move |c| Ok(Box::new(move |d| Ok(a + b + c + d)))))))
}

These functional-style functions are going to be used via template functions. The odd syntax is really important to make the templating work: it's easy to support no lazy evaluation (no Fn returns), or all lazy evaluation, but much harder to support a mix in a generic fashion.
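For reference, a minimal check (restating the functions above so the snippet is self-contained; the `main` harness is mine) that the curried `Box<dyn Fn>` chain computes the same value as the direct sum:

```rust
type Result<T> = std::result::Result<T, String>;

fn simple_sum(a: i32, b: i32, c: i32, d: i32) -> Result<i32> {
    Ok(a + b + c + d)
}

// Each Box<dyn Fn> layer captures one argument and returns the next layer.
fn fn_fn_fn_fn(a: i32) -> Result<Box<dyn Fn(i32) -> Result<Box<dyn Fn(i32) -> Result<Box<dyn Fn(i32) -> Result<i32>>>>>>> {
    Ok(Box::new(move |b| Ok(Box::new(move |c| Ok(Box::new(move |d| Ok(a + b + c + d)))))))
}

fn main() {
    // Peel the layers one argument at a time; both paths yield Ok(10).
    let curried = fn_fn_fn_fn(1).unwrap()(2).unwrap()(3).unwrap()(4);
    assert_eq!(simple_sum(1, 2, 3, 4), curried);
    println!("equal");
}
```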

@saethlin
Member

saethlin commented Feb 11, 2025

I definitely see a different instruction sequence that is fixed by removing the volatile read, but on x86_64 these microbenchmark to the same throughput.

So this is a nice use case and I personally hate the read_volatile hack that keeps being used because it's a kludge for a missing compiler feature, but I'm not sure your demo will raise the priority of this.

@findepi

findepi commented Feb 12, 2025

I definitely see a different instruction sequence that is fixed by removing the volatile read, but on x86_64 these microbenchmark to the same throughput.

@saethlin thank you for your response!
I did more benchmarks myself and I agree with you. I take back my motivating example; it's not as staggering as I thought.1
The 6x difference in my benchmark results is not because of those reads; it's because of inlining.

Details in https://gist.github.com/findepi/89497d13a3a249a1d2d1b6d7c2f8b927
Summary: the two functions, despite yielding the same assembly (with a patched compiler), inline very differently in a calling loop (with or without the patched compiler; it doesn't matter).
That is some other optimization opportunity (≠ this issue).

Footnotes

  1. With inlining disabled, the difference is pretty consistently reproducible on my machine, at about 4-5% for this benchmark, so these extra assembly instructions do have some impact on performance. I don't know whether 4% is ignorable for the compiler, but it's definitely ignorable for me at this point.
