From 04058d568e9459f748017f637b1e384234fe6007 Mon Sep 17 00:00:00 2001 From: Edd Barrett Date: Wed, 18 Sep 2024 09:52:43 +0100 Subject: [PATCH] O1 fixes. This change is a collection of fixes required for us to enable the -O1 AOT post-link pipeline in Yk. This is slightly different from just passing -O1, but it's a big step in the right direction. Probably the most significant change is the transition from llvm.experimental.stackamp to llvm.experimental.patchpoint for the control point. This is needed because we were tracking live values at a slightly incorrect point in time. We needed the register state immediately before the call to _ykrt_control_point, but *including* any register shuffling/spills required due to the call ABI. The lua_array_len.c test was removed because we now run enough optimisations for LLVM to statically detect UB in it. Hardly surprising since this was derived using cvise. We can't handle poison values in the JIT and even if we could, I doubt the test is a good one any more. Co-authored-by: Lukas Diekmann --- bin/yk-config | 6 +- tests/c/aot_debuginfo.c | 2 +- tests/c/lua_array_len.c | 89 --------------------- tests/c/phi3.c | 1 - ykcapi/src/lib.rs | 22 ++++- ykllvm | 2 +- ykrt/src/compile/jitc_yk/aot_ir.rs | 9 ++- ykrt/src/compile/jitc_yk/codegen/x64/mod.rs | 20 +++++ ykrt/src/compile/jitc_yk/trace_builder.rs | 8 +- ykrt/src/mt.rs | 14 +++- yksmp/src/lib.rs | 1 + 11 files changed, 70 insertions(+), 104 deletions(-) delete mode 100644 tests/c/lua_array_len.c diff --git a/bin/yk-config b/bin/yk-config index c6c2d65e4..3d0d4f7b1 100755 --- a/bin/yk-config +++ b/bin/yk-config @@ -44,7 +44,9 @@ set_aot_pipeline() { PRELINK_PASSES_AO0="" POSTLINK_PASSES_AO0="instcombine" PRELINK_PASSES_AO1="" -POSTLINK_PASSES_AO1="instcombine,mem2reg,instcombine" +# The whole -O1 pipeline, as reported by `opt -O1 -print-pipeline-passes +# /dev/null`. +POSTLINK_PASSES_AO1="annotation2metadata,forceattrs,inferattrs,coro-early,function(lower-expect,simplifycfg,sroa,early-cse<>),openmp-opt,ipsccp,called-value-propagation,globalopt,function(mem2reg,instcombine,simplifycfg),require,function(invalidate),require,cgscc(devirt<4>(inline,inline,function-attrs,function(sroa,early-cse,simplifycfg,instcombine,libcalls-shrinkwrap,simplifycfg,reassociate,loop-mssa(loop-instsimplify,loop-simplifycfg,licm,loop-rotate,licm,simple-loop-unswitch),simplifycfg,instcombine,loop(loop-idiom,indvars,loop-deletion,loop-unroll-full),sroa,memcpyopt,sccp,bdce,instcombine,coro-elide,adce,simplifycfg,instcombine),function-attrs,function(require),coro-split)),deadargelim,coro-cleanup,globalopt,globaldce,elim-avail-extern,rpo-function-attrs,recompute-globalsaa,function(float2int,lower-constant-intrinsics,loop(loop-rotate,loop-deletion),loop-distribute,inject-tli-mappings,loop-vectorize,infer-alignment,loop-load-elim,instcombine,simplifycfg,vector-combine,instcombine,loop-unroll,transform-warning,sroa,infer-alignment,instcombine,loop-mssa(licm),alignment-from-assumptions,loop-sink,instsimplify,div-rem-pairs,tailcallelim,simplifycfg),globaldce,constmerge,cg-profile,rel-lookup-table-converter,function(annotation-remarks),verify" # Initialise the AOT pipeline to level 0 unless YKB_AOT_OPTLEVEL is set. set_aot_pipeline 0 @@ -159,6 +161,8 @@ handle_arg() { OUTPUT="${OUTPUT} -Wl,--mllvm=--yk-shadow-stack" # Encode additional locations in stackmaps. OUTPUT="${OUTPUT} -Wl,--mllvm=--yk-stackmap-add-locs" + # Don't optimise functions by changing their calling convention. + OUTPUT="${OUTPUT} -Wl,--mllvm=--yk-dont-opt-func-abi" # Use software-tracer pass if [ "${YKB_TRACER}" = "swt" ]; then OUTPUT="${OUTPUT} -Wl,--mllvm=--yk-basicblock-tracer" diff --git a/tests/c/aot_debuginfo.c b/tests/c/aot_debuginfo.c index a1a5787f1..202da49c2 100644 --- a/tests/c/aot_debuginfo.c +++ b/tests/c/aot_debuginfo.c @@ -24,7 +24,7 @@ // ... // # aot_debuginfo.c:{{_}}: yk_mt_control_point(mt, &loc); // ... -// call __ykrt_control_point(%{{_}}, %{{_}}, 1i64) [safepoint:... +// call llvm.experimental.patchpoint.void(0i64, 13i32, __ykrt_control_point... // ... // # aot_debuginfo.c:{{_}}: i--; // ... diff --git a/tests/c/lua_array_len.c b/tests/c/lua_array_len.c deleted file mode 100644 index d8292ebfe..000000000 --- a/tests/c/lua_array_len.c +++ /dev/null @@ -1,89 +0,0 @@ -// ## yk-config-env: YKB_AOT_OPTLEVEL=1 -// Run-time: -// env-var: YKD_LOG_IR=-:aot,jit-pre-opt -// env-var: YKD_SERIALISE_COMPILATION=1 -// env-var: YK_LOG=4 - -// This was reduced from yklua. -// -// In short a `ptradd %ptr, 0` was reducing to nothing in the tracebuilder and -// the AOT instruction was being associated with the last JIT instruction -// (which was nothing to do with the `ptr_add`!). -// -// This caused a runtime crash. -// -// I tried to reduce this to a test with a simple constant 0-byte offset field -// access, but I can't get ykllvm to emit a constant zero offset GEP when -// mem2reg is enabled. - -#include -#include -#include -#include -#include -#include - -typedef unsigned char lu_byte; -typedef long long lua_Integer; -typedef unsigned long long lua_Unsigned; - -typedef union Value { - lua_Integer i; -} Value; - -typedef struct TValue { - Value value_; lu_byte tt_; -} TValue; - -typedef union StackValue { - TValue val; - struct { - Value value_; lu_byte tt_; - unsigned short delta; - } tbclist; -} StackValue; - -typedef StackValue *StkId; -typedef struct Table {} Table; - -union GCUnion { - struct Table h; -}; - -__attribute__((noinline)) -lua_Unsigned luaH_getn(Table *t) { return 0; } - -__attribute__((noinline)) -void luaV_objlen(StkId ra, const TValue *rb) { - Table *h = &((union GCUnion *)rb)->h; - TValue *io = (TValue *) ra; - io->value_.i = luaH_getn(h); -} - -int main(int argc, char **argv) { - YkMT *mt = yk_mt_new(NULL); - yk_mt_hot_threshold_set(mt, 0); - YkLocation loc = yk_location_new(); - - int i = 4; - StackValue sv; - sv.val.value_.i = 1; - TValue tv; - tv.value_.i = 1; - StkId ra = &sv; - TValue *rb = &tv; - NOOPT_VAL(loc); - NOOPT_VAL(i); - NOOPT_VAL(ra); - NOOPT_VAL(rb); - while (i > 0) { - yk_mt_control_point(mt, &loc); - luaV_objlen(ra, rb); - fprintf(stderr, "%d\n", i); - i--; - } - fprintf(stderr, "exit\n"); - yk_location_drop(loc); - yk_mt_shutdown(mt); - return (EXIT_SUCCESS); -} diff --git a/tests/c/phi3.c b/tests/c/phi3.c index 8a5d21c5c..3c7c8010a 100644 --- a/tests/c/phi3.c +++ b/tests/c/phi3.c @@ -1,4 +1,3 @@ -// ## yk-config-env: YKB_AOT_OPTLEVEL=1 // Run-time: // env-var: YKD_LOG_IR=-:aot,jit-pre-opt // env-var: YK_LOG=4 diff --git a/ykcapi/src/lib.rs b/ykcapi/src/lib.rs index 03511dc43..9698beba5 100644 --- a/ykcapi/src/lib.rs +++ b/ykcapi/src/lib.rs @@ -66,11 +66,25 @@ pub extern "C" fn __ykrt_control_point( // callee-saved registers for us (so we don't have to do it here). unsafe { std::arch::asm!( - // Push callee-saved registers to the stack as these may contain trace inputs (live + // Push all registers to the stack as these may contain trace inputs (live // variables) referenced by the control point's stackmap. + // + // We don't need to push and restore `rdx` since `smid` can never be a live value and + // thus won't be tracked by the stackmap. + // + // FIXME: In the future we want the control point to return naturally into the compiled + // trace (at the moment we just rip out the control point's stack), which means we then + // no longer need to recover callee-saved registers as the control point will do this + // for us. + "push rax", + "push rcx", "push rbx", "push rdi", "push rsi", + "push r8", + "push r9", + "push r10", + "push r11", "push r12", "push r13", "push r14", @@ -83,9 +97,15 @@ pub extern "C" fn __ykrt_control_point( "pop r14", "pop r13", "pop r12", + "pop r11", + "pop r10", + "pop r9", + "pop r8", "pop rsi", "pop rdi", "pop rbx", + "pop rcx", + "pop rax", "ret", options(noreturn) ); diff --git a/ykllvm b/ykllvm index cbe366d62..df8f25adb 160000 --- a/ykllvm +++ b/ykllvm @@ -1 +1 @@ -Subproject commit cbe366d62656d7259c36f22053726517b73d30ee +Subproject commit df8f25adb2ee7f5da6cd48f3d1ec528b770d08a5 diff --git a/ykrt/src/compile/jitc_yk/aot_ir.rs b/ykrt/src/compile/jitc_yk/aot_ir.rs index 689c8e036..00a08ec2a 100644 --- a/ykrt/src/compile/jitc_yk/aot_ir.rs +++ b/ykrt/src/compile/jitc_yk/aot_ir.rs @@ -45,8 +45,6 @@ const MAGIC: u32 = 0xedd5f00d; /// The version of the bytecode format. const FORMAT_VERSION: u32 = 0; -/// The symbol name of the control point function (after ykllvm has transformed it). -const CONTROL_POINT_NAME: &str = "__ykrt_control_point"; const LLVM_DEBUG_CALL_NAME: &str = "llvm.dbg.value"; /// A struct identifying a line in a source file. @@ -1059,8 +1057,13 @@ impl Inst { /// Returns whether `self` is a call to the control point. pub(crate) fn is_control_point(&self, m: &Module) -> bool { + // FIXME: We don't really expect any other patchpoints here, but it would be nice to check + // the third argument is actually the control point. This would require the ability to + // reconstitute a string from constant bytes though. match self { - Self::Call { callee, .. } => m.func(*callee).name == CONTROL_POINT_NAME, + Self::Call { callee, .. } => { + m.func(*callee).name == "llvm.experimental.patchpoint.void" + } _ => false, } } diff --git a/ykrt/src/compile/jitc_yk/codegen/x64/mod.rs b/ykrt/src/compile/jitc_yk/codegen/x64/mod.rs index eaeb83530..fa35e7955 100644 --- a/ykrt/src/compile/jitc_yk/codegen/x64/mod.rs +++ b/ykrt/src/compile/jitc_yk/codegen/x64/mod.rs @@ -695,6 +695,17 @@ impl<'a> Assemble<'a> { fn cg_loadtraceinput(&mut self, iidx: jit_ir::InstIdx, inst: &jit_ir::LoadTraceInputInst) { let m = match &self.m.tilocs()[usize::try_from(inst.locidx()).unwrap()] { + yksmp::Location::Register(0, ..) => { + VarLocation::Register(reg_alloc::Register::GP(Rq::RAX)) + } + yksmp::Location::Register(1, ..) => { + // The third argument to the control point is the stackmap ID and is passed by RDX. + // We don't anticipate this ever being a live variable. + unreachable!() + } + yksmp::Location::Register(2, ..) => { + VarLocation::Register(reg_alloc::Register::GP(Rq::RCX)) + } yksmp::Location::Register(3, ..) => { VarLocation::Register(reg_alloc::Register::GP(Rq::RBX)) } @@ -704,6 +715,15 @@ impl<'a> Assemble<'a> { yksmp::Location::Register(5, ..) => { VarLocation::Register(reg_alloc::Register::GP(Rq::RDI)) } + yksmp::Location::Register(8, ..) => { + VarLocation::Register(reg_alloc::Register::GP(Rq::R8)) + } + yksmp::Location::Register(9, ..) => { + VarLocation::Register(reg_alloc::Register::GP(Rq::R9)) + } + yksmp::Location::Register(10, ..) => { + VarLocation::Register(reg_alloc::Register::GP(Rq::R10)) + } yksmp::Location::Register(12, ..) => { VarLocation::Register(reg_alloc::Register::GP(Rq::R12)) } diff --git a/ykrt/src/compile/jitc_yk/trace_builder.rs b/ykrt/src/compile/jitc_yk/trace_builder.rs index 1820d01a7..1e0be4c77 100644 --- a/ykrt/src/compile/jitc_yk/trace_builder.rs +++ b/ykrt/src/compile/jitc_yk/trace_builder.rs @@ -660,6 +660,11 @@ impl TraceBuilder { if AOT_MOD.func(*callee).name() == "__yk_trace_basicblock" { return Ok(()); } + + if inst.is_control_point(self.aot_mod) { + return Ok(()); + } + // Convert AOT args to JIT args. let mut jit_args = Vec::new(); for arg in args { @@ -722,9 +727,6 @@ impl TraceBuilder { } _ => panic!(), } - if inst.is_control_point(self.aot_mod) { - return Ok(()); - } let jit_func_decl_idx = self.handle_func(*callee)?; let inst = jit_ir::DirectCallInst::new(&mut self.jit_mod, jit_func_decl_idx, jit_args)?.into(); diff --git a/ykrt/src/mt.rs b/ykrt/src/mt.rs index 5d8071e1e..a57b3a8f6 100644 --- a/ykrt/src/mt.rs +++ b/ykrt/src/mt.rs @@ -700,16 +700,22 @@ unsafe extern "C" fn exec_trace( // Reset RSP to the end of the control point frame (this includes the registers we pushed // just before the control point) "mov rsp, rsi", - "sub rsp, 8", // Return address of control point call - "sub rsp, 56", // Registers pushed in naked cp call - //// Restore callee-saved registers which were pushed to the stack in __ykrt_control_point. + "sub rsp, 8", // Return address of control point call + "sub rsp, 104", // Registers pushed in naked cp call (includes alignment) + // Restore registers which were pushed to the stack in [ykcapi::__ykrt_control_point]. "pop r15", "pop r14", "pop r13", "pop r12", + "pop r11", + "pop r10", + "pop r9", + "pop r8", "pop rsi", - "pop rdi", // Don't overwrite `rdi` for now until we remove the first two args. + "pop rdi", "pop rbx", + "pop rcx", + "pop rax", // Call the trace function. "jmp rdx", "ret", diff --git a/yksmp/src/lib.rs b/yksmp/src/lib.rs index 3ac8b61e2..024dd51ea 100644 --- a/yksmp/src/lib.rs +++ b/yksmp/src/lib.rs @@ -285,6 +285,7 @@ impl StackMapParser<'_> { fn read_liveouts(&mut self, num: u16) { for _ in 0..num { let _dwreg = self.read_u16(); + let _reserved = self.read_u8(); let _size = self.read_u8(); } }