From 2bd6230d0457a3c336d5d6ff0de77390c436b2eb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20Gustavsson?= Date: Sat, 6 Jan 2024 08:58:44 +0100 Subject: [PATCH] AArch64: Enhance the effectiveness of the register cache 365224b5b62183 introduced a cache to keep track of BEAM registers that had already been loaded into CPU registers. This commit further improves the effectiveness of the cache. Now 21,109 loads of BEAM registers are avoided, compared to 15,098 loads before this commit. (When loading all modules in OTP and the standard library of Elixir 1.16.0.) Of those load instructions, 8,975 instructions were no longer needed because the contents of the BEAM register were already present in the desired CPU register. Before this commit, only 1,861 load instructions were eliminated. The remaining load instructions are replaced with a `mov` instruction (from register to register), which is more efficient than a load instruction but does not reduce the code size. As an example, the following BEAM code: {test,is_nonempty_list,{f,3},[{y,0}]}. {get_hd,{y,0},{x,0}}. would be translated to native code like so: # is_nonempty_list_fS ldr x8, [x20] tbnz x8, 1, @label_3-1 # get_hd_Sd ldr x8, [x20] ldur x25, [x8, -1] That is, `{y,0}` would be loaded into a CPU register twice. 
The improved caching avoids reloading `{y,0}` in the `get_hd` instruction: # is_nonempty_list_fS ldr x8, [x20] tbnz x8, 1, @label_3-1 # get_hd_Sd # skipped fetching of BEAM register ldur x25, [x8, -1] --- erts/emulator/beam/jit/arm/beam_asm.hpp | 309 ++++++++++++------ erts/emulator/beam/jit/arm/instr_arith.cpp | 10 +- erts/emulator/beam/jit/arm/instr_bif.cpp | 4 +- erts/emulator/beam/jit/arm/instr_bs.cpp | 2 +- erts/emulator/beam/jit/arm/instr_common.cpp | 193 ++++++----- .../beam/jit/arm/instr_guard_bifs.cpp | 2 +- erts/emulator/beam/jit/arm/instr_select.cpp | 2 +- erts/emulator/beam/jit/arm/ops.tab | 3 +- 8 files changed, 342 insertions(+), 183 deletions(-) diff --git a/erts/emulator/beam/jit/arm/beam_asm.hpp b/erts/emulator/beam/jit/arm/beam_asm.hpp index 1e78abfc2c39..1362767c4c4f 100644 --- a/erts/emulator/beam/jit/arm/beam_asm.hpp +++ b/erts/emulator/beam/jit/arm/beam_asm.hpp @@ -961,95 +961,149 @@ class BeamModuleAssembler : public BeamAssembler, /* Skip unnecessary moves in load_source(), load_sources(), and * mov_arg(). Don't use these variables directly. */ size_t last_destination_offset = 0; - arm::Gp last_destination_from1, last_destination_from2; - arm::Mem last_destination_to1, last_destination_to2; - /* Private helper. */ - void preserve__cache(arm::Gp dst) { - last_destination_offset = a.offset(); - invalidate_cache(dst); - } + struct CacheEntry { + arm::Mem mem; + arm::Gp reg; + }; + + static const int max_cache_entries = 16; + CacheEntry cache[max_cache_entries]; + int num_cache_entries = 0; bool is_cache_valid() { return a.offset() == last_destination_offset; } - /* Works as the STR instruction, but also updates the cache. */ - void str_cache(arm::Gp src, arm::Mem dst) { - if (a.offset() == last_destination_offset && - dst != last_destination_to1) { - /* Something is already cached in the first slot. Use the - * second slot. 
*/ - a.str(src, dst); - last_destination_offset = a.offset(); - last_destination_to2 = dst; - last_destination_from2 = src; - } else { - /* Nothing cached yet, or the first slot has the same - * memory address as we will store into. Use the first - * slot and invalidate the second slot. */ - a.str(src, dst); - last_destination_offset = a.offset(); - last_destination_to1 = dst; - last_destination_from1 = src; - last_destination_to2 = arm::Mem(); + void consolidate_cache() { + if (!is_cache_valid()) { + num_cache_entries = 0; } - } - /* Works as the STP instruction, but also updates the cache. */ - void stp_cache(arm::Gp src1, arm::Gp src2, arm::Mem dst) { - safe_stp(src1, src2, dst); last_destination_offset = a.offset(); - last_destination_to1 = dst; - last_destination_from1 = src1; - last_destination_to2 = - arm::Mem(arm::GpX(dst.baseId()), dst.offset() + 8); - last_destination_from2 = src2; } void invalidate_cache(arm::Gp dst) { - if (dst == last_destination_from1) { - last_destination_to1 = arm::Mem(); - last_destination_from1 = arm::Gp(); - } - if (dst == last_destination_from2) { - last_destination_to2 = arm::Mem(); - last_destination_from2 = arm::Gp(); + for (int i = 0; i < num_cache_entries; i++) { + if (dst == cache[i].reg) { + cache[i].mem = arm::Mem(); + cache[i].reg = arm::Gp(); + } else if (cache[i].mem.hasBase() && + cache[i].mem.baseReg() == dst) { + cache[i].mem = arm::Mem(); + cache[i].reg = arm::Gp(); + } } } - /* Works like LDR, but looks in the cache first. 
*/ - void ldr_cached(arm::Gp dst, arm::Mem mem) { - if (a.offset() == last_destination_offset) { - arm::Gp cached_reg; - if (mem == last_destination_to1) { - cached_reg = last_destination_from1; - } else if (mem == last_destination_to2) { - cached_reg = last_destination_from2; + void put_cache_entry(arm::Mem mem, arm::Gp reg) { + int slot; + + if (reg == SUPER_TMP) { + return; + } + + for (slot = 0; slot < num_cache_entries; slot++) { + if (cache[slot].mem == mem) { + break; } + } - if (cached_reg.isValid()) { - /* This memory location is cached. */ - if (cached_reg != dst) { - comment("simplified fetching of BEAM register"); - a.mov(dst, cached_reg); - preserve__cache(dst); + if (slot >= num_cache_entries) { + for (slot = 0; slot < num_cache_entries; slot++) { + if (!cache[slot].mem.hasBase()) { + break; + } + } + + if (slot >= num_cache_entries) { + if (num_cache_entries < max_cache_entries) { + slot = num_cache_entries++; } else { - comment("skipped fetching of BEAM register"); - invalidate_cache(dst); + slot = 0; } + } + cache[slot].mem = mem; + } + + cache[slot].reg = reg; + } + + arm::Gp find_cache(arm::Mem mem) { + consolidate_cache(); + + for (int slot = 0; slot < num_cache_entries; slot++) { + if (mem == cache[slot].mem) { + ASSERT(cache[slot].reg.isValid()); + return cache[slot].reg; + } + } + + return arm::Gp(); + } + + /* Works as the STR instruction, but also updates the cache. */ + void str_cache(arm::Gp src, arm::Mem dst) { + consolidate_cache(); + invalidate_cache(src); + + a.str(src, dst); + + put_cache_entry(dst, src); + last_destination_offset = a.offset(); + } + + /* Works as the STP instruction, but also updates the cache. 
*/ + void stp_cache(arm::Gp src1, arm::Gp src2, arm::Mem dst) { + arm::Mem next_dst = arm::Mem(arm::GpX(dst.baseId()), dst.offset() + 8); + + consolidate_cache(); + invalidate_cache(src1); + invalidate_cache(src2); + + safe_stp(src1, src2, dst); + + put_cache_entry(dst, src1); + put_cache_entry(next_dst, src2); + + last_destination_offset = a.offset(); + } + + /* Works like LDR, but looks in the cache first. */ + void ldr_cached(arm::Gp dst, arm::Mem mem) { + arm::Gp cached_reg = find_cache(mem); + + if (cached_reg.isValid()) { + /* This memory location is cached. */ + if (cached_reg == dst) { + comment("skipped fetching of BEAM register"); } else { - /* Not cached. Load and preserve the cache. */ - a.ldr(dst, mem); - preserve__cache(dst); + comment("simplified fetching of BEAM register"); + a.mov(dst, cached_reg); + invalidate_cache(dst); + last_destination_offset = a.offset(); } } else { - /* The cache is invalid. */ + /* Not cached. Load and update cache. */ a.ldr(dst, mem); + invalidate_cache(dst); + put_cache_entry(mem, dst); + last_destination_offset = a.offset(); + } + } + + void preserve_cache(std::function generate, + std::initializer_list clobber = {}) { + bool valid_cache = is_cache_valid(); + + generate(); + + if (valid_cache) { + for (const auto ® : clobber) { + invalidate_cache(reg); + } + last_destination_offset = a.offset(); - last_destination_to1 = mem; - last_destination_from1 = dst; - last_destination_to2 = arm::Mem(); } } @@ -1058,21 +1112,19 @@ class BeamModuleAssembler : public BeamAssembler, } void mov_preserve_cache(arm::Gp dst, arm::Gp src) { - if (a.offset() == last_destination_offset) { - a.mov(dst, src); - preserve__cache(dst); - } else { - a.mov(dst, src); - } + preserve_cache( + [&]() { + a.mov(dst, src); + }, + {dst}); } - void mov_imm_preserve_cache(arm::Gp dst, UWord value) { - if (a.offset() == last_destination_offset) { - mov_imm(dst, value); - preserve__cache(dst); - } else { - mov_imm(dst, value); - } + void 
untag_ptr_preserve_cache(arm::Gp dst, arm::Gp src) { + preserve_cache( + [&]() { + emit_untag_ptr(dst, src); + }, + {dst}); } arm::Mem embed_label(const Label &label, enum Displacement disp); @@ -1150,8 +1202,31 @@ class BeamModuleAssembler : public BeamAssembler, arm::Gp emit_call_fun(bool skip_box_test = false, bool skip_header_test = false); + void emit_is_cons(Label Fail, arm::Gp Src) { + preserve_cache([&]() { + BeamAssembler::emit_is_cons(Fail, Src); + }); + } + + void emit_is_not_cons(Label Fail, arm::Gp Src) { + preserve_cache([&]() { + BeamAssembler::emit_is_not_cons(Fail, Src); + }); + } + + void emit_is_list(Label Fail, arm::Gp Src) { + preserve_cache([&]() { + a.tst(Src, imm(_TAG_PRIMARY_MASK - TAG_PRIMARY_LIST)); + a.mov(SUPER_TMP, NIL); + a.ccmp(Src, SUPER_TMP, imm(NZCV::kEqual), imm(arm::CondCode::kNE)); + a.b_ne(Fail); + }); + } + void emit_is_boxed(Label Fail, arm::Gp Src) { - BeamAssembler::emit_is_boxed(Fail, Src); + preserve_cache([&]() { + BeamAssembler::emit_is_boxed(Fail, Src); + }); } void emit_is_boxed(Label Fail, const ArgVal &Arg, arm::Gp Src) { @@ -1160,7 +1235,9 @@ class BeamModuleAssembler : public BeamAssembler, return; } - BeamAssembler::emit_is_boxed(Fail, Src); + preserve_cache([&]() { + BeamAssembler::emit_is_boxed(Fail, Src); + }); } /* Copies `count` words from the address at `from`, to the address at `to`. 
@@ -1446,7 +1523,11 @@ class BeamModuleAssembler : public BeamAssembler, ASSERT(tmp.isGpX()); if (arg.isLiteral()) { - a.ldr(tmp, embed_constant(arg, disp32K)); + preserve_cache( + [&]() { + a.ldr(tmp, embed_constant(arg, disp32K)); + }, + {tmp}); return Variable(tmp); } else if (arg.isRegister()) { if (isRegisterBacked(arg)) { @@ -1465,16 +1546,55 @@ class BeamModuleAssembler : public BeamAssembler, : arg.as().get(); if (Support::isIntOrUInt32(val)) { - mov_imm_preserve_cache(tmp, val); + preserve_cache( + [&]() { + mov_imm(tmp, val); + }, + {tmp}); return Variable(tmp); } } - a.ldr(tmp, embed_constant(arg, disp32K)); + preserve_cache( + [&]() { + a.ldr(tmp, embed_constant(arg, disp32K)); + }, + {tmp}); return Variable(tmp); } } + /* + * Load the argument into ANY register, using the + * cache to avoid reloading the value. + * + * Because it is not possible to predict into which register + * the value will end up, the following code is UNSAFE: + * + * auto src = load_source(Src); + * a.tst(src.reg, ...); + * a.mov(TMP2, NIL); + * a.ccmp(src.reg, TMP2, ..., ...); + * + * If the value of Src happens to end up in TMP2, it will be + * overwritten before its second use. + * + * Basically, the only safe way to use this function is when the + * register is used immediately and only once. 
For example: + * + * a.and_(TMP1, load_source(Src), imm(...)); + * a.cmp(TMP1, imm(...)); + */ + Variable load_source(const ArgVal &arg) { + arm::Gp cached_reg = find_cache(getArgRef(arg)); + + if (cached_reg.isValid()) { + return load_source(arg, cached_reg); + } else { + return load_source(arg, TMP1); + } + } + auto load_sources(const ArgVal &Src1, arm::Gp tmp1, const ArgVal &Src2, @@ -1559,6 +1679,8 @@ class BeamModuleAssembler : public BeamAssembler, void flush_var(const Variable &to) { if (to.mem.hasBase()) { str_cache(to.reg, to.mem); + } else { + invalidate_cache(to.reg); } } @@ -1707,11 +1829,11 @@ class BeamModuleAssembler : public BeamAssembler, ASSERT(gp.isGpX()); if (abs_offset <= sizeof(Eterm) * MAX_LDR_STR_DISPLACEMENT) { - bool valid_cache = is_cache_valid(); - a.ldr(gp, mem); - if (valid_cache) { - preserve__cache(gp); - } + preserve_cache( + [&]() { + a.ldr(gp, mem); + }, + {gp}); } else { add(SUPER_TMP, arm::GpX(mem.baseId()), offset); a.ldr(gp, arm::Mem(SUPER_TMP)); @@ -1751,12 +1873,11 @@ class BeamModuleAssembler : public BeamAssembler, ASSERT(gp1 != gp2); if (abs_offset <= sizeof(Eterm) * MAX_LDP_STP_DISPLACEMENT) { - bool valid_cache = is_cache_valid(); - a.ldp(gp1, gp2, mem); - if (valid_cache) { - preserve__cache(gp1); - preserve__cache(gp2); - } + preserve_cache( + [&]() { + a.ldp(gp1, gp2, mem); + }, + {gp1, gp2}); } else if (abs_offset < sizeof(Eterm) * MAX_LDR_STR_DISPLACEMENT) { /* Note that we used `<` instead of `<=`, as we're loading two * elements rather than one. 
*/ diff --git a/erts/emulator/beam/jit/arm/instr_arith.cpp b/erts/emulator/beam/jit/arm/instr_arith.cpp index 7e0ccf9d240c..d8ca86c4b192 100644 --- a/erts/emulator/beam/jit/arm/instr_arith.cpp +++ b/erts/emulator/beam/jit/arm/instr_arith.cpp @@ -152,7 +152,7 @@ void BeamModuleAssembler::emit_i_plus(const ArgLabel &Fail, if (always_small(LHS) && always_small(RHS) && is_small_result) { auto dst = init_destination(Dst, ARG1); if (rhs_is_arm_literal) { - auto lhs = load_source(LHS, ARG2); + auto lhs = load_source(LHS); Uint cleared_tag = RHS.as().get() & ~_TAG_IMMED1_MASK; comment("add small constant without overflow check"); a.add(dst.reg, lhs.reg, imm(cleared_tag)); @@ -341,7 +341,7 @@ void BeamModuleAssembler::emit_i_minus(const ArgLabel &Fail, if (always_small(LHS) && always_small(RHS) && is_small_result) { auto dst = init_destination(Dst, ARG1); if (rhs_is_arm_literal) { - auto lhs = load_source(LHS, ARG2); + auto lhs = load_source(LHS); Uint cleared_tag = RHS.as().get() & ~_TAG_IMMED1_MASK; comment("subtract small constant without overflow check"); a.sub(dst.reg, lhs.reg, imm(cleared_tag)); @@ -1175,7 +1175,7 @@ void BeamModuleAssembler::emit_i_band(const ArgLabel &Fail, &ignore)) { comment("skipped test for small operands since they are always " "small"); - auto lhs = load_source(LHS, ARG2); + auto lhs = load_source(LHS); auto dst = init_destination(Dst, ARG1); /* TAG & TAG = TAG, so we don't need to tag it again. 
*/ @@ -1269,7 +1269,7 @@ void BeamModuleAssembler::emit_i_bor(const ArgLabel &Fail, if (a64::Utils::encodeLogicalImm(rhs, 64, &ignore)) { comment("skipped test for small operands since they are always " "small"); - auto lhs = load_source(LHS, ARG2); + auto lhs = load_source(LHS); auto dst = init_destination(Dst, ARG1); a.orr(dst.reg, lhs.reg, rhs); @@ -1624,7 +1624,7 @@ void BeamModuleAssembler::emit_i_bsl(const ArgLabel &Fail, if (is_bsl_small(LHS, RHS)) { comment("skipped tests because operands and result are always small"); if (RHS.isSmall()) { - auto lhs = load_source(LHS, ARG2); + auto lhs = load_source(LHS); a.and_(TMP1, lhs.reg, imm(~_TAG_IMMED1_MASK)); a.lsl(TMP1, TMP1, imm(RHS.as().getSigned())); } else { diff --git a/erts/emulator/beam/jit/arm/instr_bif.cpp b/erts/emulator/beam/jit/arm/instr_bif.cpp index 9cf133bcc705..09a799f0f44d 100644 --- a/erts/emulator/beam/jit/arm/instr_bif.cpp +++ b/erts/emulator/beam/jit/arm/instr_bif.cpp @@ -109,7 +109,7 @@ void BeamModuleAssembler::emit_i_bif1(const ArgSource &Src1, const ArgLabel &Fail, const ArgWord &Bif, const ArgRegister &Dst) { - auto src1 = load_source(Src1, TMP1); + auto src1 = load_source(Src1); a.str(src1.reg, getXRef(0)); @@ -169,7 +169,7 @@ void BeamModuleAssembler::emit_i_bif(const ArgLabel &Fail, void BeamModuleAssembler::emit_nofail_bif1(const ArgSource &Src1, const ArgWord &Bif, const ArgRegister &Dst) { - auto src1 = load_source(Src1, TMP1); + auto src1 = load_source(Src1); a.str(src1.reg, getXRef(0)); diff --git a/erts/emulator/beam/jit/arm/instr_bs.cpp b/erts/emulator/beam/jit/arm/instr_bs.cpp index 4c714ef4aa2f..0981726fc33f 100644 --- a/erts/emulator/beam/jit/arm/instr_bs.cpp +++ b/erts/emulator/beam/jit/arm/instr_bs.cpp @@ -632,7 +632,7 @@ void BeamModuleAssembler::emit_i_bs_match_string(const ArgRegister &Ctx, void BeamModuleAssembler::emit_i_bs_get_position(const ArgRegister &Ctx, const ArgRegister &Dst) { const int start_offset = offsetof(ErlSubBits, start); - auto ctx_reg = 
load_source(Ctx, TMP1); + auto ctx_reg = load_source(Ctx); auto dst_reg = init_destination(Dst, TMP2); /* Match contexts can never be literals, so we can skip clearing literal diff --git a/erts/emulator/beam/jit/arm/instr_common.cpp b/erts/emulator/beam/jit/arm/instr_common.cpp index 57e0b3ad8af4..dd4f2c650e03 100644 --- a/erts/emulator/beam/jit/arm/instr_common.cpp +++ b/erts/emulator/beam/jit/arm/instr_common.cpp @@ -272,29 +272,31 @@ void BeamModuleAssembler::emit_continue_exit() { void BeamModuleAssembler::emit_get_list(const ArgRegister &Src, const ArgRegister &Hd, const ArgRegister &Tl) { - auto src = load_source(Src, TMP1); + auto src = load_source(Src); + + /* The `ldp` instruction does not accept a negative offset, so we + * need to get rid of tag bits beforehand. */ + untag_ptr_preserve_cache(TMP1, src.reg); + auto hd = init_destination(Hd, TMP2); auto tl = init_destination(Tl, TMP3); - arm::Gp cons_ptr = emit_ptr_val(TMP1, src.reg); - /* The `ldp` instruction does not accept a negative offset, so we - * will need subtract the LIST tag beforehand. (This also nicely - * take care of the potential overwriting issue when Src == Hd.) */ - a.sub(TMP1, cons_ptr, imm(TAG_PRIMARY_LIST)); if (hd.reg == tl.reg) { /* ldp with two identical registers is an illegal - * instruction. Produce the same result at the interpreter. */ + * instruction. Produce the same result as the interpreter. 
*/ a.ldr(tl.reg, arm::Mem(TMP1, sizeof(Eterm))); flush_var(tl); } else { - a.ldp(hd.reg, tl.reg, arm::Mem(TMP1)); + preserve_cache([&]() { + a.ldp(hd.reg, tl.reg, arm::Mem(TMP1)); + }); flush_vars(hd, tl); } } void BeamModuleAssembler::emit_get_hd(const ArgRegister &Src, const ArgRegister &Hd) { - auto src = load_source(Src, TMP1); + auto src = load_source(Src); auto hd = init_destination(Hd, TMP2); arm::Gp cons_ptr = emit_ptr_val(TMP1, src.reg); @@ -304,7 +306,7 @@ void BeamModuleAssembler::emit_get_hd(const ArgRegister &Src, void BeamModuleAssembler::emit_get_tl(const ArgRegister &Src, const ArgRegister &Tl) { - auto src = load_source(Src, TMP1); + auto src = load_source(Src); auto tl = init_destination(Tl, TMP2); arm::Gp cons_ptr = emit_ptr_val(TMP1, src.reg); @@ -344,8 +346,9 @@ void BeamModuleAssembler::emit_i_get_hash(const ArgConstant &Src, /* Store the untagged pointer to a tuple in ARG1. */ void BeamModuleAssembler::emit_load_tuple_ptr(const ArgSource &Src) { - auto src = load_source(Src, ARG1); - emit_untag_ptr(ARG1, src.reg); + auto src = load_source(Src); + + untag_ptr_preserve_cache(ARG1, src.reg); } #ifdef DEBUG @@ -711,10 +714,14 @@ void BeamModuleAssembler::emit_put_list(const ArgSource &Hd, const ArgSource &Tl, const ArgRegister &Dst) { auto [hd, tl] = load_sources(Hd, TMP1, Tl, TMP2); + auto hd_reg = hd.reg; + auto tl_reg = tl.reg; auto dst = init_destination(Dst, TMP3); - a.stp(hd.reg, tl.reg, arm::Mem(HTOP).post(sizeof(Eterm[2]))); - a.sub(dst.reg, HTOP, imm(sizeof(Eterm[2]) - TAG_PRIMARY_LIST)); + preserve_cache([&]() { + a.stp(hd_reg, tl_reg, arm::Mem(HTOP).post(sizeof(Eterm[2]))); + a.sub(dst.reg, HTOP, imm(sizeof(Eterm[2]) - TAG_PRIMARY_LIST)); + }); flush_var(dst); } @@ -1006,7 +1013,7 @@ void BeamModuleAssembler::emit_set_tuple_element(const ArgSource &Element, void BeamModuleAssembler::emit_is_nonempty_list(const ArgLabel &Fail, const ArgRegister &Src) { - auto list_ptr = load_source(Src, TMP1); + auto list_ptr = load_source(Src); 
emit_is_cons(resolve_beam_label(Fail, dispUnknown), list_ptr.reg); } @@ -1017,11 +1024,15 @@ void BeamModuleAssembler::emit_jump(const ArgLabel &Fail) { void BeamModuleAssembler::emit_is_atom(const ArgLabel &Fail, const ArgSource &Src) { - auto src = load_source(Src, TMP1); + auto src = load_source(Src); - a.and_(TMP1, src.reg, imm(_TAG_IMMED2_MASK)); - a.cmp(TMP1, imm(_TAG_IMMED2_ATOM)); - a.b_ne(resolve_beam_label(Fail, disp1MB)); + preserve_cache( + [&]() { + a.and_(TMP1, src.reg, imm(_TAG_IMMED2_MASK)); + a.cmp(TMP1, imm(_TAG_IMMED2_ATOM)); + a.b_ne(resolve_beam_label(Fail, disp1MB)); + }, + {TMP1}); } void BeamModuleAssembler::emit_is_boolean(const ArgLabel &Fail, @@ -1237,17 +1248,14 @@ void BeamModuleAssembler::emit_is_integer(const ArgLabel &Fail, void BeamModuleAssembler::emit_is_list(const ArgLabel &Fail, const ArgSource &Src) { - auto src = load_source(Src, TMP1); + auto src = load_source(Src); - a.tst(src.reg, imm(_TAG_PRIMARY_MASK - TAG_PRIMARY_LIST)); - a.mov(TMP2, NIL); - a.ccmp(src.reg, TMP2, imm(NZCV::kEqual), imm(arm::CondCode::kNE)); - a.b_ne(resolve_beam_label(Fail, disp1MB)); + emit_is_list(resolve_beam_label(Fail, dispUnknown), src.reg); } void BeamModuleAssembler::emit_is_map(const ArgLabel &Fail, const ArgSource &Src) { - auto src = load_source(Src, TMP1); + auto src = load_source(Src); emit_is_boxed(resolve_beam_label(Fail, dispUnknown), Src, src.reg); @@ -1256,23 +1264,29 @@ void BeamModuleAssembler::emit_is_map(const ArgLabel &Fail, if (masked_types(Src) == BeamTypeId::Map) { comment("skipped header test since we know it's a map when boxed"); } else { - arm::Gp boxed_ptr = emit_ptr_val(TMP1, src.reg); - a.ldur(TMP1, emit_boxed_val(boxed_ptr)); - a.and_(TMP1, TMP1, imm(_TAG_HEADER_MASK)); - a.cmp(TMP1, imm(_TAG_HEADER_MAP)); - a.b_ne(resolve_beam_label(Fail, disp1MB)); + preserve_cache( + [&]() { + arm::Gp boxed_ptr = emit_ptr_val(TMP3, src.reg); + a.ldur(TMP3, emit_boxed_val(boxed_ptr)); + a.and_(TMP3, TMP3, imm(_TAG_HEADER_MASK)); + 
a.cmp(TMP3, imm(_TAG_HEADER_MAP)); + a.b_ne(resolve_beam_label(Fail, disp1MB)); + }, + {TMP3}); } } void BeamModuleAssembler::emit_is_nil(const ArgLabel &Fail, const ArgRegister &Src) { - auto src = load_source(Src, TMP1); + auto src = load_source(Src); if (always_one_of(Src)) { emit_is_not_cons(resolve_beam_label(Fail, dispUnknown), src.reg); } else { - a.cmp(src.reg, imm(NIL)); - a.b_ne(resolve_beam_label(Fail, disp1MB)); + preserve_cache([&]() { + a.cmp(src.reg, imm(NIL)); + a.b_ne(resolve_beam_label(Fail, disp1MB)); + }); } } @@ -1723,8 +1737,10 @@ void BeamModuleAssembler::emit_is_eq_exact(const ArgLabel &Fail, comment("simplified check since one argument is an immediate"); } - cmp_arg(x.reg, Y); - a.b_ne(resolve_beam_label(Fail, disp1MB)); + preserve_cache([&]() { + cmp_arg(x.reg, Y); + a.b_ne(resolve_beam_label(Fail, disp1MB)); + }); return; } @@ -1875,8 +1891,10 @@ void BeamModuleAssembler::emit_is_ne_exact(const ArgLabel &Fail, comment("simplified check since one argument is an immediate"); } - cmp_arg(x.reg, Y); - a.b_eq(resolve_beam_label(Fail, disp1MB)); + preserve_cache([&]() { + cmp_arg(x.reg, Y); + a.b_eq(resolve_beam_label(Fail, disp1MB)); + }); return; } @@ -2568,23 +2586,27 @@ void BeamModuleAssembler::emit_is_ge_lt(ArgLabel const &Fail1, mov_arg(ARG2, A); mov_arg(ARG3, B); - a.and_(TMP1, src.reg, imm(_TAG_IMMED1_MASK)); - a.cmp(TMP1, imm(_TAG_IMMED1_SMALL)); - a.b_ne(generic); + preserve_cache( + [&]() { + a.and_(TMP1, src.reg, imm(_TAG_IMMED1_MASK)); + a.cmp(TMP1, imm(_TAG_IMMED1_SMALL)); + a.b_ne(generic); - a.cmp(src.reg, ARG2); - a.b_lt(resolve_beam_label(Fail1, disp1MB)); - a.cmp(ARG3, src.reg); - a.b_ge(resolve_beam_label(Fail2, disp1MB)); - a.b(next); + a.cmp(src.reg, ARG2); + a.b_lt(resolve_beam_label(Fail1, disp1MB)); + a.cmp(ARG3, src.reg); + a.b_ge(resolve_beam_label(Fail2, disp1MB)); + a.b(next); - a.bind(generic); - mov_var(ARG1, src); - fragment_call(ga->get_is_ge_lt_shared()); - a.b_lt(resolve_beam_label(Fail1, disp1MB)); - 
a.b_gt(resolve_beam_label(Fail2, disp1MB)); + a.bind(generic); + mov_var(ARG1, src); + fragment_call(ga->get_is_ge_lt_shared()); + a.b_lt(resolve_beam_label(Fail1, disp1MB)); + a.b_gt(resolve_beam_label(Fail2, disp1MB)); - a.bind(next); + a.bind(next); + }, + {TMP1}); } /* @@ -2604,10 +2626,15 @@ void BeamModuleAssembler::emit_is_ge_ge(ArgLabel const &Fail1, } auto src = load_source(Src, ARG1); - subs(TMP1, src.reg, A.as().get()); - a.b_lt(resolve_beam_label(Fail1, disp1MB)); - cmp(TMP1, B.as().get() - A.as().get()); - a.b_lo(resolve_beam_label(Fail2, disp1MB)); + + preserve_cache( + [&]() { + subs(TMP1, src.reg, A.as().get()); + a.b_lt(resolve_beam_label(Fail1, disp1MB)); + cmp(TMP1, B.as().get() - A.as().get()); + a.b_lo(resolve_beam_label(Fail2, disp1MB)); + }, + {TMP1}); } /* @@ -2624,16 +2651,20 @@ void BeamModuleAssembler::emit_is_int_in_range(ArgLabel const &Fail, ArgConstant const &Max) { auto src = load_source(Src, ARG1); - sub(TMP1, src.reg, Min.as().get()); + preserve_cache( + [&]() { + sub(TMP1, src.reg, Min.as().get()); - /* Since we have subtracted the (tagged) lower bound, the - * tag bits of the difference is 0 if and only if Src is - * a small. */ - ERTS_CT_ASSERT(_TAG_IMMED1_SMALL == _TAG_IMMED1_MASK); - a.tst(TMP1, imm(_TAG_IMMED1_MASK)); - a.b_ne(resolve_beam_label(Fail, disp1MB)); - cmp(TMP1, Max.as().get() - Min.as().get()); - a.b_hi(resolve_beam_label(Fail, disp1MB)); + /* Since we have subtracted the (tagged) lower bound, the tag + * bits of the difference is 0 if and only if Src is a + * small. 
*/ + ERTS_CT_ASSERT(_TAG_IMMED1_SMALL == _TAG_IMMED1_MASK); + a.tst(TMP1, imm(_TAG_IMMED1_MASK)); + a.b_ne(resolve_beam_label(Fail, disp1MB)); + cmp(TMP1, Max.as().get() - Min.as().get()); + a.b_hi(resolve_beam_label(Fail, disp1MB)); + }, + {TMP1}); } /* @@ -2649,25 +2680,33 @@ void BeamModuleAssembler::emit_is_int_ge(ArgLabel const &Fail, comment("simplified small test since all other types are boxed"); emit_is_boxed(small, Src, src.reg); } else { - a.and_(TMP2, src.reg, imm(_TAG_IMMED1_MASK)); - a.cmp(TMP2, imm(_TAG_IMMED1_SMALL)); - a.b_eq(small); + preserve_cache( + [&]() { + a.and_(TMP2, src.reg, imm(_TAG_IMMED1_MASK)); + a.cmp(TMP2, imm(_TAG_IMMED1_SMALL)); + a.b_eq(small); + }, + {TMP2}); emit_is_boxed(resolve_beam_label(Fail, dispUnknown), Src, TMP2); } - arm::Gp boxed_ptr = emit_ptr_val(TMP1, src.reg); - a.ldur(TMP1, emit_boxed_val(boxed_ptr)); - a.and_(TMP1, TMP1, imm(_TAG_HEADER_MASK)); - a.cmp(TMP1, imm(_TAG_HEADER_POS_BIG)); - a.b_ne(resolve_beam_label(Fail, disp1MB)); - a.b(next); + preserve_cache( + [&]() { + arm::Gp boxed_ptr = emit_ptr_val(TMP1, src.reg); + a.ldur(TMP1, emit_boxed_val(boxed_ptr)); + a.and_(TMP1, TMP1, imm(_TAG_HEADER_MASK)); + a.cmp(TMP1, imm(_TAG_HEADER_POS_BIG)); + a.b_ne(resolve_beam_label(Fail, disp1MB)); + a.b(next); - a.bind(small); - cmp(src.reg, Min.as().get()); - a.b_lt(resolve_beam_label(Fail, disp1MB)); + a.bind(small); + cmp(src.reg, Min.as().get()); + a.b_lt(resolve_beam_label(Fail, disp1MB)); - a.bind(next); + a.bind(next); + }, + {TMP1}); } void BeamModuleAssembler::emit_badmatch(const ArgSource &Src) { diff --git a/erts/emulator/beam/jit/arm/instr_guard_bifs.cpp b/erts/emulator/beam/jit/arm/instr_guard_bifs.cpp index 9bc3e540f914..5982e89a28ad 100644 --- a/erts/emulator/beam/jit/arm/instr_guard_bifs.cpp +++ b/erts/emulator/beam/jit/arm/instr_guard_bifs.cpp @@ -143,7 +143,7 @@ void BeamModuleAssembler::emit_cmp_immed_to_bool(arm::CondCode cc, const ArgSource &RHS, const ArgRegister &Dst) { if (RHS.isImmed()) { - auto 
lhs = load_source(LHS, TMP1); + auto lhs = load_source(LHS); cmp_arg(lhs.reg, RHS); } else { auto [lhs, rhs] = load_sources(LHS, TMP1, RHS, TMP2); diff --git a/erts/emulator/beam/jit/arm/instr_select.cpp b/erts/emulator/beam/jit/arm/instr_select.cpp index 5717b7a51860..fa931183d56c 100644 --- a/erts/emulator/beam/jit/arm/instr_select.cpp +++ b/erts/emulator/beam/jit/arm/instr_select.cpp @@ -256,7 +256,7 @@ void BeamModuleAssembler::emit_i_select_tuple_arity(const ArgRegister &Src, const ArgLabel &Fail, const ArgWord &Size, const Span &args) { - auto src = load_source(Src, TMP1); + auto src = load_source(Src); emit_is_boxed(resolve_beam_label(Fail, dispUnknown), Src, src.reg); diff --git a/erts/emulator/beam/jit/arm/ops.tab b/erts/emulator/beam/jit/arm/ops.tab index 210833dcdb7d..a5c98b27ca5e 100644 --- a/erts/emulator/beam/jit/arm/ops.tab +++ b/erts/emulator/beam/jit/arm/ops.tab @@ -163,8 +163,7 @@ is_number f s jump f # -# List matching instructions. The combination of test for a nonempty list -# followed by get_{list/hd/tl} are common, so we will optimize that. +# List matching instructions. # is_nonempty_list Fail nqia => jump Fail