From 5c88b72ebc9979a1b856eefda25b9bc11fefaae1 Mon Sep 17 00:00:00 2001 From: Jameson Nash Date: Fri, 20 Dec 2024 20:24:09 +0000 Subject: [PATCH 1/5] teach jitlayers to use equivalent edges Sometimes an edge (especially from precompile file, but sometimes from inference) will specify a CodeInstance that does not need to be compiled for its ABI and simply needs to be cloned to point to the existing copy of it. --- src/gf.c | 6 +--- src/jitlayers.cpp | 79 +++++++++++++++++++++++++++++++++++++++++------ 2 files changed, 71 insertions(+), 14 deletions(-) diff --git a/src/gf.c b/src/gf.c index 52741094c0533..dcf1823f57746 100644 --- a/src/gf.c +++ b/src/gf.c @@ -344,7 +344,7 @@ jl_datatype_t *jl_mk_builtin_func(jl_datatype_t *dt, const char *name, jl_fptr_a jl_code_instance_t *jl_type_infer(jl_method_instance_t *mi, size_t world, uint8_t source_mode) { if (jl_typeinf_func == NULL) - return NULL; + return jl_method_inferred_with_abi(mi, world); jl_task_t *ct = jl_current_task; if (ct->reentrant_timing & 0b1000) { // We must avoid attempting to re-enter inference here @@ -2857,9 +2857,6 @@ jl_code_instance_t *jl_compile_method_internal(jl_method_instance_t *mi, size_t } // Ok, compilation is enabled. We'll need to try to compile something (probably). - // Try to find a codeinst we have already inferred (e.g. while we were compiling - // something else). - codeinst = jl_method_inferred_with_abi(mi, world); // Everything from here on is considered (user facing) compile time uint64_t start = jl_typeinf_timing_begin(); @@ -2875,7 +2872,6 @@ jl_code_instance_t *jl_compile_method_internal(jl_method_instance_t *mi, size_t codeinst_old = jl_atomic_load_relaxed(&codeinst_old->next); } - // This codeinst hasn't been previously inferred do that now // jl_type_infer will internally do a cache lookup and jl_engine_reserve call // to synchronize this across threads if (!codeinst) { diff --git a/src/jitlayers.cpp b/src/jitlayers.cpp index bdb8298df41a6..139a5631d63c0 100644 --- a/src/jitlayers.cpp +++ b/src/jitlayers.cpp @@ -301,6 +301,29 @@ static void finish_params(Module *M, jl_codegen_params_t ¶ms) JL_NOTSAFEPOIN } } +// look for something with an egal ABI that is already in the JIT +static jl_code_instance_t *jl_method_compiled_egal(jl_code_instance_t *ci JL_PROPAGATES_ROOT) JL_NOTSAFEPOINT +{ + jl_method_instance_t *mi = ci->def; + jl_value_t *owner = ci->owner; + jl_value_t *rettype = ci->rettype; + size_t min_world = jl_atomic_load_relaxed(&ci->min_world); + size_t max_world = jl_atomic_load_relaxed(&ci->max_world); + jl_code_instance_t *codeinst = jl_atomic_load_relaxed(&mi->cache); + while (codeinst) { + if (codeinst != ci && + jl_atomic_load_relaxed(&codeinst->inferred) != NULL && + jl_atomic_load_relaxed(&codeinst->invoke) != NULL && + jl_atomic_load_relaxed(&codeinst->min_world) <= min_world && + jl_atomic_load_relaxed(&codeinst->max_world) >= max_world && + jl_egal(codeinst->owner, owner) && + jl_egal(codeinst->rettype, rettype)) { + return codeinst; + } + codeinst = jl_atomic_load_relaxed(&codeinst->next); + } + return codeinst; +} static int jl_analyze_workqueue(jl_code_instance_t *callee, jl_codegen_params_t ¶ms, bool forceall=false) JL_NOTSAFEPOINT_LEAVE JL_NOTSAFEPOINT_ENTER { @@ -309,6 +332,7 @@ static int jl_analyze_workqueue(jl_code_instance_t *callee, jl_codegen_params_t std::swap(params.workqueue, edges); for (auto &it : edges) { jl_code_instance_t *codeinst = it.first; + JL_GC_PROMISE_ROOTED(codeinst); auto &proto = it.second; // try to emit code for this item from the workqueue 
StringRef invokeName = ""; @@ -321,7 +345,7 @@ static int jl_analyze_workqueue(jl_code_instance_t *callee, jl_codegen_params_t // But it must be consistent with the following invokenames lookup, which is protected by the engine_lock uint8_t specsigflags; void *fptr; - void jl_read_codeinst_invoke(jl_code_instance_t *ci, uint8_t *specsigflags, jl_callptr_t *invoke, void **specptr, int waitcompile) JL_NOTSAFEPOINT; // not a safepoint (or deadlock) in this file due to 0 parameter + void jl_read_codeinst_invoke(jl_code_instance_t *ci, uint8_t *specsigflags, jl_callptr_t *invoke, void **specptr, int waitcompile) JL_NOTSAFEPOINT; // declare it is not a safepoint (or deadlock) in this file due to 0 parameter jl_read_codeinst_invoke(codeinst, &specsigflags, &invoke, &fptr, 0); //if (specsig ? specsigflags & 0b1 : invoke == jl_fptr_args_addr) if (invoke == jl_fptr_args_addr) { @@ -349,6 +373,40 @@ static int jl_analyze_workqueue(jl_code_instance_t *callee, jl_codegen_params_t force = true; } } + if (preal_decl.empty()) { + // there may be an equivalent method already compiled (or at least registered with the JIT to compile), in which case we should be using that instead + jl_code_instance_t *compiled_ci = jl_method_compiled_egal(codeinst); + if (compiled_ci) { + codeinst = compiled_ci; + uint8_t specsigflags; + void *fptr; + jl_read_codeinst_invoke(codeinst, &specsigflags, &invoke, &fptr, 0); + //if (specsig ? specsigflags & 0b1 : invoke == jl_fptr_args_addr) + if (invoke == jl_fptr_args_addr) { + preal_decl = jl_ExecutionEngine->getFunctionAtAddress((uintptr_t)fptr, invoke, codeinst); + } + else if (specsigflags & 0b1) { + preal_decl = jl_ExecutionEngine->getFunctionAtAddress((uintptr_t)fptr, invoke, codeinst); + preal_specsig = true; + } + if (preal_decl.empty()) { + auto it = invokenames.find(codeinst); + if (it != invokenames.end()) { + auto &decls = it->second; + invokeName = decls.functionObject; + if (decls.functionObject == "jl_fptr_args") { + preal_decl = decls.specFunctionObject; + isedge = true; + } + else if (decls.functionObject != "jl_fptr_sparam" && decls.functionObject != "jl_f_opaque_closure_call") { + preal_decl = decls.specFunctionObject; + preal_specsig = true; + isedge = true; + } + } + } + } + } if (!preal_decl.empty() || force) { // if we have a prototype emitted, compare it to what we emitted earlier Module *mod = proto.decl->getParent(); @@ -733,14 +791,17 @@ static void recursive_compile_graph( while (!workqueue.empty()) { auto this_code = workqueue.pop_back_val(); if (Seen.insert(this_code).second) { - if (this_code != codeinst) { - JL_GC_PROMISE_ROOTED(this_code); // rooted transitively from following edges from original argument - jl_emit_codeinst_to_jit(this_code, nullptr); // contains safepoints - } - jl_unique_gcsafe_lock lock(engine_lock); - auto edges = complete_graph.find(this_code); - if (edges != complete_graph.end()) { - workqueue.append(edges->second); + jl_code_instance_t *compiled_ci = jl_method_compiled_egal(codeinst); + if (!compiled_ci) { + if (this_code != codeinst) { + JL_GC_PROMISE_ROOTED(this_code); // rooted transitively from following edges from original argument + jl_emit_codeinst_to_jit(this_code, nullptr); // contains safepoints + } + jl_unique_gcsafe_lock lock(engine_lock); + auto edges = complete_graph.find(this_code); + if (edges != complete_graph.end()) { + workqueue.append(edges->second); + } } } } From b6fbd8123af1f68fb8aa5f06551bd28b0fd1ee3a Mon Sep 17 00:00:00 2001 From: Jameson Nash Date: Wed, 18 Dec 2024 18:52:48 +0000 Subject: [PATCH 
2/5] opaque_closure: fix data-race mistakes with reading fields by using standard helper function --- src/gf.c | 1 + src/opaque_closure.c | 28 +++++++++++++++------------- 2 files changed, 16 insertions(+), 13 deletions(-) diff --git a/src/gf.c b/src/gf.c index dcf1823f57746..8086d75329bef 100644 --- a/src/gf.c +++ b/src/gf.c @@ -2722,6 +2722,7 @@ void jl_read_codeinst_invoke(jl_code_instance_t *ci, uint8_t *specsigflags, jl_c initial_invoke = jl_atomic_load_acquire(&ci->invoke); // happens-before for subsequent read of fptr } void *fptr = jl_atomic_load_relaxed(&ci->specptr.fptr); + // TODO: if fptr is NULL, it may mean we read this too fast, and should have spun and waited for jl_compile_codeinst to finish if (initial_invoke == NULL || fptr == NULL) { *invoke = initial_invoke; *specptr = NULL; diff --git a/src/opaque_closure.c b/src/opaque_closure.c index e3334c037f5a9..8a9dcac30a4f8 100644 --- a/src/opaque_closure.c +++ b/src/opaque_closure.c @@ -67,12 +67,14 @@ static jl_opaque_closure_t *new_opaque_closure(jl_tupletype_t *argt, jl_value_t ci = jl_compile_method_internal(mi, world); } - jl_fptr_args_t invoke = (jl_fptr_args_t)jl_interpret_opaque_closure; + jl_fptr_args_t callptr = (jl_fptr_args_t)jl_interpret_opaque_closure; void *specptr = NULL; if (ci) { - invoke = (jl_fptr_args_t)jl_atomic_load_relaxed(&ci->invoke); - specptr = jl_atomic_load_relaxed(&ci->specptr.fptr); + uint8_t specsigflags; + jl_callptr_t invoke; + jl_read_codeinst_invoke(ci, &specsigflags, &invoke, &specptr, 1); + callptr = (jl_fptr_args_t)invoke; // codegen puts the object (or a jl_fptr_interpret_call token )here for us, even though it was the wrong type to put here selected_rt = ci->rettype; // If we're not allowed to generate a specsig with this, rt, fall @@ -82,7 +84,7 @@ static jl_opaque_closure_t *new_opaque_closure(jl_tupletype_t *argt, jl_value_t // TODO: It would be better to try to get a specialization with the // correct rt check here (or we could codegen a wrapper). specptr = NULL; // this will force codegen of the unspecialized version - invoke = (jl_fptr_args_t)jl_interpret_opaque_closure; + callptr = (jl_fptr_args_t)jl_interpret_opaque_closure; jl_value_t *ts[2] = {rt_lb, (jl_value_t*)ci->rettype}; selected_rt = jl_type_union(ts, 2); } @@ -90,18 +92,18 @@ static jl_opaque_closure_t *new_opaque_closure(jl_tupletype_t *argt, jl_value_t // TODO: It would be better to try to get a specialization with the // correct rt check here (or we could codegen a wrapper). specptr = NULL; // this will force codegen of the unspecialized version - invoke = (jl_fptr_args_t)jl_interpret_opaque_closure; + callptr = (jl_fptr_args_t)jl_interpret_opaque_closure; selected_rt = jl_type_intersection(rt_ub, selected_rt); } - if (invoke == (jl_fptr_args_t) jl_fptr_interpret_call) { - invoke = (jl_fptr_args_t)jl_interpret_opaque_closure; + if (callptr == (jl_fptr_args_t)jl_fptr_interpret_call) { + callptr = (jl_fptr_args_t)jl_interpret_opaque_closure; } - else if (invoke == (jl_fptr_args_t)jl_fptr_args && specptr) { - invoke = (jl_fptr_args_t)specptr; + else if (callptr == (jl_fptr_args_t)jl_fptr_args && specptr != NULL) { + callptr = (jl_fptr_args_t)specptr; } - else if (invoke == (jl_fptr_args_t)jl_fptr_const_return) { - invoke = jl_isa(ci->rettype_const, selected_rt) ? + else if (callptr == (jl_fptr_args_t)jl_fptr_const_return) { + callptr = jl_isa(ci->rettype_const, selected_rt) ? 
(jl_fptr_args_t)jl_fptr_const_opaque_closure : (jl_fptr_args_t)jl_fptr_const_opaque_closure_typeerror; captures = ci->rettype_const; @@ -117,14 +119,14 @@ static jl_opaque_closure_t *new_opaque_closure(jl_tupletype_t *argt, jl_value_t // OC wrapper methods are not world dependent and have no edges or other info ci = jl_get_method_inferred(mi_generic, selected_rt, 1, ~(size_t)0, NULL, NULL); if (!jl_atomic_load_acquire(&ci->invoke)) - jl_compile_codeinst(ci); // confusing this actually calls jl_emit_oc_wrapper and never actually compiles ci (which would be impossible) + jl_compile_codeinst(ci); // confusing this actually calls jl_emit_oc_wrapper and never actually compiles ci (which would be impossible since it cannot have source) specptr = jl_atomic_load_relaxed(&ci->specptr.fptr); } jl_opaque_closure_t *oc = (jl_opaque_closure_t*)jl_gc_alloc(ct->ptls, sizeof(jl_opaque_closure_t), oc_type); oc->source = source; oc->captures = captures; oc->world = world; - oc->invoke = invoke; + oc->invoke = callptr; oc->specptr = specptr; JL_GC_POP(); From 835c8ac9aacdbb806695230aed875b4508bbf319 Mon Sep 17 00:00:00 2001 From: Jameson Nash Date: Wed, 18 Dec 2024 20:39:34 +0000 Subject: [PATCH 3/5] opaque_closure: fix world-age mistake in fallback path This was failing the h_world_age test sometimes. --- src/codegen.cpp | 13 ++++++++++++- src/gf.c | 12 ++++++++++++ src/julia.h | 1 + 3 files changed, 25 insertions(+), 1 deletion(-) diff --git a/src/codegen.cpp b/src/codegen.cpp index 5bf7c74deedcb..ceaba9507e4c7 100644 --- a/src/codegen.cpp +++ b/src/codegen.cpp @@ -996,6 +996,15 @@ static const auto jlinvoke_func = new JuliaFunction<>{ {AttributeSet(), Attributes(C, {Attribute::ReadOnly, Attribute::NoCapture})}); }, }; +static const auto jlinvokeoc_func = new JuliaFunction<>{ + XSTR(jl_invoke_oc), + get_func2_sig, + [](LLVMContext &C) { return AttributeList::get(C, + AttributeSet(), + Attributes(C, {Attribute::NonNull}), + {AttributeSet(), + Attributes(C, {Attribute::ReadOnly, Attribute::NoCapture})}); }, +}; static const auto jlopaque_closure_call_func = new JuliaFunction<>{ XSTR(jl_f_opaque_closure_call), get_func_sig, @@ -7288,7 +7297,9 @@ Function *emit_tojlinvoke(jl_code_instance_t *codeinst, StringRef theFptrName, M theFarg = literal_pointer_val(ctx, (jl_value_t*)codeinst); } else { - theFunc = prepare_call(jlinvoke_func); + jl_method_instance_t *mi = codeinst->def; + bool is_opaque_closure = jl_is_method(mi->def.value) && mi->def.method->is_for_opaque_closure; + theFunc = prepare_call(is_opaque_closure ? jlinvokeoc_func : jlinvoke_func); theFarg = literal_pointer_val(ctx, (jl_value_t*)jl_get_ci_mi(codeinst)); } theFarg = track_pjlvalue(ctx, theFarg); diff --git a/src/gf.c b/src/gf.c index 8086d75329bef..3a308ca6154fc 100644 --- a/src/gf.c +++ b/src/gf.c @@ -3373,6 +3373,18 @@ JL_DLLEXPORT jl_value_t *jl_invoke(jl_value_t *F, jl_value_t **args, uint32_t na return _jl_invoke(F, args, nargs, mfunc, world); } +JL_DLLEXPORT jl_value_t *jl_invoke_oc(jl_value_t *F, jl_value_t **args, uint32_t nargs, jl_method_instance_t *mfunc) +{ + jl_opaque_closure_t *oc = (jl_opaque_closure_t*)F; + jl_task_t *ct = jl_current_task; + size_t last_age = ct->world_age; + size_t world = oc->world; + ct->world_age = world; + jl_value_t *ret = _jl_invoke(F, args, nargs, mfunc, world); + ct->world_age = last_age; + return ret; +} + STATIC_INLINE int sig_match_fast(jl_value_t *arg1t, jl_value_t **args, jl_value_t **sig, size_t n) { // NOTE: This function is a huge performance hot spot!! 
diff --git a/src/julia.h b/src/julia.h index a71f9db030274..9b8d090fe37cb 100644 --- a/src/julia.h +++ b/src/julia.h @@ -2237,6 +2237,7 @@ STATIC_INLINE int jl_vinfo_usedundef(uint8_t vi) JL_DLLEXPORT jl_value_t *jl_apply_generic(jl_value_t *F, jl_value_t **args, uint32_t nargs); JL_DLLEXPORT jl_value_t *jl_invoke(jl_value_t *F, jl_value_t **args, uint32_t nargs, jl_method_instance_t *meth); +JL_DLLEXPORT jl_value_t *jl_invoke_oc(jl_value_t *F, jl_value_t **args, uint32_t nargs, jl_method_instance_t *meth); JL_DLLEXPORT int32_t jl_invoke_api(jl_code_instance_t *linfo); STATIC_INLINE jl_value_t *jl_apply(jl_value_t **args, uint32_t nargs) From 59d74d46f8aeb4c826f2b2c65f36978589cdfafb Mon Sep 17 00:00:00 2001 From: Jameson Nash Date: Wed, 11 Dec 2024 16:33:46 +0000 Subject: [PATCH 4/5] inference,codegen: connect source directly to jit This avoids unnecessary compression when running (not generating code). While generating code, we continue the legacy behavior of storing compressed code, since restarting from a ji without that is quite slow. Eventually, we should also remove that code also once we have generated the object file from it. This replaces the defective SOURCE_MODE_FORCE_SOURCE option with a new `typeinf_ext_toplevel` batch-mode interface for compilation which returns all required source code. Only two options remain now: SOURCE_MODE_NOT_REQUIRED : Require only that the IPO information (e.g. rettype and friends) is present. SOURCE_MODE_FORCE_ABI : Require that the IPO information is present (for ABI computation) and that the returned CodeInstance can be invoked on the host target (preferably after inference, called directly, but perfectly acceptable for Base.Compiler to instead force the runtime to use a stub there or call into it with the interpreter instead by having failed to provide any code). This replaces the awkward `jl_create_native` interface (which is now just a shim for calling the new batch-mode `typeinf_ext_toplevel`) with a simpler `jl_emit_native` API, which does not do any inference or other callbacks, but simply is a batch-mode call to `jl_emit_codeinfo` and the work to build the external wrapper around them for linkage. --- Compiler/src/bootstrap.jl | 19 +- Compiler/src/tfuncs.jl | 18 +- Compiler/src/typeinfer.jl | 245 ++++++++++++++++---------- Compiler/src/types.jl | 13 +- Compiler/src/utilities.jl | 3 + Compiler/test/abioverride.jl | 30 ++-- base/reflection.jl | 8 - src/aotcompile.cpp | 332 +++++++++++++++-------------------- src/ccall.cpp | 8 +- src/codegen-stubs.c | 20 ++- src/codegen.cpp | 81 +-------- src/gf.c | 60 ++++++- src/jitlayers.cpp | 46 +++-- src/jitlayers.h | 8 +- src/jl_exported_funcs.inc | 3 +- src/julia.h | 1 + src/julia_internal.h | 17 +- src/precompile_utils.c | 16 +- src/toplevel.c | 16 +- test/precompile.jl | 59 +++---- 20 files changed, 501 insertions(+), 502 deletions(-) diff --git a/Compiler/src/bootstrap.jl b/Compiler/src/bootstrap.jl index 475c53e317152..ea7510df313c7 100644 --- a/Compiler/src/bootstrap.jl +++ b/Compiler/src/bootstrap.jl @@ -16,9 +16,9 @@ function activate_codegen!() end function bootstrap!() + global bootstrapping_compiler = true let time() = ccall(:jl_clock_now, Float64, ()) println("Compiling the compiler. 
This may take several minutes ...") - interp = NativeInterpreter() ssa_inlining_pass!_tt = Tuple{typeof(ssa_inlining_pass!), IRCode, InliningState{NativeInterpreter}, Bool} optimize_tt = Tuple{typeof(optimize), NativeInterpreter, OptimizationState{NativeInterpreter}, InferenceResult} @@ -45,13 +45,15 @@ function bootstrap!() end end starttime = time() + methods = Any[] + world = get_world_counter() for f in fs if isa(f, DataType) && f.name === typename(Tuple) tt = f else tt = Tuple{typeof(f), Vararg{Any}} end - matches = _methods_by_ftype(tt, 10, get_world_counter())::Vector + matches = _methods_by_ftype(tt, 10, world)::Vector if isempty(matches) println(stderr, "WARNING: no matching method found for `", tt, "`") else @@ -62,14 +64,25 @@ function bootstrap!() for i = 1:length(params) params[i] = unwraptv(params[i]) end - typeinf_type(interp, m.method, Tuple{params...}, m.sparams) + mi = specialize_method(m.method, Tuple{params...}, m.sparams) + #isa_compileable_sig(mi) || println(stderr, "WARNING: inferring `", mi, "` which isn't expected to be called.") + push!(methods, mi) end end end + codeinfos = typeinf_ext_toplevel(methods, [world], false) + for i = 1:2:length(codeinfos) + ci = codeinfos[i]::CodeInstance + src = codeinfos[i + 1]::CodeInfo + isa_compileable_sig(ci.def) || continue # println(stderr, "WARNING: compiling `", ci.def, "` which isn't expected to be called.") + ccall(:jl_add_codeinst_to_jit, Cvoid, (Any, Any), ci, src) + end endtime = time() println("Base.Compiler ──── ", sub_float(endtime,starttime), " seconds") end activate_codegen!() + global bootstrapping_compiler = false + nothing end function activate!(; reflection=true, codegen=false) diff --git a/Compiler/src/tfuncs.jl b/Compiler/src/tfuncs.jl index e51d43e5b2fe1..f0c793a4ae3b7 100644 --- a/Compiler/src/tfuncs.jl +++ b/Compiler/src/tfuncs.jl @@ -42,11 +42,10 @@ macro nospecs(ex) push!(names, arg) end @assert isexpr(body, :block) - if !isempty(names) - lin = first(body.args)::LineNumberNode - nospec = Expr(:macrocall, Symbol("@nospecialize"), lin, names...) - insert!(body.args, 2, nospec) - end + isempty(names) && throw(ArgumentError("no arguments for @nospec")) + lin = first(body.args)::LineNumberNode + nospec = Expr(:macrocall, GlobalRef(@__MODULE__, :var"@nospecialize"), lin, names...) 
+ insert!(body.args, 2, nospec) return esc(ex) end @@ -2115,7 +2114,7 @@ add_tfunc(memoryrefoffset, 1, 1, memoryrefoffset_tfunc, 5) return true end -@nospecs function memoryref_elemtype(@nospecialize mem) +@nospecs function memoryref_elemtype(mem) m = widenconst(mem) if !has_free_typevars(m) && m <: GenericMemoryRef m0 = m @@ -2131,7 +2130,7 @@ end return Any end -@nospecs function _memoryref_elemtype(@nospecialize mem) +@nospecs function _memoryref_elemtype(mem) m = widenconst(mem) if !has_free_typevars(m) && m <: GenericMemoryRef m0 = m @@ -2166,7 +2165,7 @@ end end # whether getindex for the elements can potentially throw UndefRef -function array_type_undefable(@nospecialize(arytype)) +@nospecs function array_type_undefable(arytype) arytype = unwrap_unionall(arytype) if isa(arytype, Union) return array_type_undefable(arytype.a) || array_type_undefable(arytype.b) @@ -2247,7 +2246,7 @@ end return boundscheck ⊑ Bool && memtype ⊑ GenericMemoryRef && order ⊑ Symbol end -@nospecs function memorynew_nothrow(argtypes::Vector{Any}) +function memorynew_nothrow(argtypes::Vector{Any}) if !(argtypes[1] isa Const && argtypes[2] isa Const) return false end @@ -2263,6 +2262,7 @@ end overflows = checked_smul_int(len, elsz)[2] return !overflows end + # Query whether the given builtin is guaranteed not to throw given the `argtypes`. # `argtypes` can be assumed not to contain varargs. function _builtin_nothrow(𝕃::AbstractLattice, @nospecialize(f::Builtin), argtypes::Vector{Any}, diff --git a/Compiler/src/typeinfer.jl b/Compiler/src/typeinfer.jl index eba520a03ada1..45df4707e4caa 100644 --- a/Compiler/src/typeinfer.jl +++ b/Compiler/src/typeinfer.jl @@ -92,8 +92,7 @@ If set to `true`, record per-method-instance timings within type inference in th __set_measure_typeinf(onoff::Bool) = __measure_typeinf__[] = onoff const __measure_typeinf__ = RefValue{Bool}(false) -function finish!(interp::AbstractInterpreter, caller::InferenceState; - can_discard_trees::Bool=may_discard_trees(interp)) +function finish!(interp::AbstractInterpreter, caller::InferenceState) result = caller.result opt = result.src if opt isa OptimizationState @@ -120,20 +119,28 @@ function finish!(interp::AbstractInterpreter, caller::InferenceState; store_backedges(ci, edges) end inferred_result = nothing + uncompressed = inferred_result relocatability = 0x1 const_flag = is_result_constabi_eligible(result) - if !can_discard_trees || (is_cached(caller) && !const_flag) + discard_src = caller.cache_mode === CACHE_MODE_NULL || const_flag + if !discard_src inferred_result = transform_result_for_cache(interp, result) # TODO: do we want to augment edges here with any :invoke targets that we got from inlining (such that we didn't have a direct edge to it already)? 
- relocatability = 0x0 if inferred_result isa CodeInfo + if may_compress(interp) + nslots = length(inferred_result.slotflags) + resize!(inferred_result.slottypes::Vector{Any}, nslots) + resize!(inferred_result.slotnames, nslots) + end di = inferred_result.debuginfo uncompressed = inferred_result - inferred_result = maybe_compress_codeinfo(interp, result.linfo, inferred_result, can_discard_trees) - result.is_src_volatile |= uncompressed !== inferred_result + inferred_result = maybe_compress_codeinfo(interp, result.linfo, inferred_result) + result.is_src_volatile = false elseif ci.owner === nothing # The global cache can only handle objects that codegen understands inferred_result = nothing + else + relocatability = 0x0 end if isa(inferred_result, String) t = @_gc_preserve_begin inferred_result @@ -141,7 +148,7 @@ function finish!(interp::AbstractInterpreter, caller::InferenceState; @_gc_preserve_end t end end - # n.b. relocatability = isa(inferred_result, String) && inferred_result[end] + # n.b. relocatability = !isa(inferred_result, String) || inferred_result[end] if !@isdefined di di = DebugInfo(result.linfo) end @@ -149,6 +156,19 @@ function finish!(interp::AbstractInterpreter, caller::InferenceState; ci, inferred_result, const_flag, first(result.valid_worlds), last(result.valid_worlds), encode_effects(result.ipo_effects), result.analysis_results, relocatability, di, edges) engine_reject(interp, ci) + if !discard_src && isdefined(interp, :codegen) && uncompressed isa CodeInfo + # record that the caller could use this result to generate code when required, if desired, to avoid repeating n^2 work + interp.codegen[ci] = uncompressed + if bootstrapping_compiler && inferred_result == nothing + # This is necessary to get decent bootstrapping performance + # when compiling the compiler to inject everything eagerly + # where codegen can start finding and using it right away + mi = result.linfo + if mi.def isa Method && isa_compileable_sig(mi) + ccall(:jl_add_codeinst_to_jit, Cvoid, (Any, Any), ci, uncompressed) + end + end + end end return nothing end @@ -223,19 +243,13 @@ end transform_result_for_cache(::AbstractInterpreter, result::InferenceResult) = result.src -function maybe_compress_codeinfo(interp::AbstractInterpreter, mi::MethodInstance, ci::CodeInfo, - can_discard_trees::Bool=may_discard_trees(interp)) +function maybe_compress_codeinfo(interp::AbstractInterpreter, mi::MethodInstance, ci::CodeInfo) def = mi.def isa(def, Method) || return ci # don't compress toplevel code - cache_the_tree = true - if can_discard_trees - cache_the_tree = is_inlineable(ci) || isa_compileable_sig(mi.specTypes, mi.sparam_vals, def) - end + can_discard_trees = may_discard_trees(interp) + cache_the_tree = !can_discard_trees || is_inlineable(ci) if cache_the_tree if may_compress(interp) - nslots = length(ci.slotflags) - resize!(ci.slottypes::Vector{Any}, nslots) - resize!(ci.slotnames, nslots) return ccall(:jl_compress_ir, String, (Any, Any), def, ci) else return ci @@ -476,7 +490,7 @@ function finishinfer!(me::InferenceState, interp::AbstractInterpreter) rettype_const = nothing const_flags = 0x0 end - relocatability = 0x0 + relocatability = 0x1 di = nothing edges = empty_edges # `edges` will be updated within `finish!` ci = result.ci @@ -484,10 +498,10 @@ function finishinfer!(me::InferenceState, interp::AbstractInterpreter) ci, widenconst(result_type), widenconst(result.exc_result), rettype_const, const_flags, first(result.valid_worlds), last(result.valid_worlds), encode_effects(result.ipo_effects), 
result.analysis_results, di, edges) - if is_cached(me) + if is_cached(me) # CACHE_MODE_GLOBAL cached_result = cache_result!(me.interp, result, ci) if !cached_result - me.cache_mode = CACHE_MODE_NULL + me.cache_mode = CACHE_MODE_VOLATILE end end end @@ -705,7 +719,7 @@ function resolve_call_cycle!(interp::AbstractInterpreter, mi::MethodInstance, pa for frameid = reverse(1:length(frames)) frame = frames[frameid] isa(frame, InferenceState) || break - uncached |= !is_cached(frame) # ensure we never add an uncached frame to a cycle + uncached |= !is_cached(frame) # ensure we never add a (globally) uncached frame to a cycle if is_same_frame(interp, mi, frame) if uncached # our attempt to speculate into a constant call lead to an undesired self-cycle @@ -796,6 +810,7 @@ function typeinf_edge(interp::AbstractInterpreter, method::Method, @nospecialize cache_mode = CACHE_MODE_GLOBAL # cache edge targets globally by default force_inline = is_stmt_inline(get_curr_ssaflag(caller)) edge_ci = nothing + # check cache with SOURCE_MODE_NOT_REQUIRED source_mode let codeinst = get(code_cache(interp), mi, nothing) if codeinst isa CodeInstance # return existing rettype if the code is already inferred inferred = @atomic :monotonic codeinst.inferred @@ -945,21 +960,6 @@ function codeinfo_for_const(interp::AbstractInterpreter, mi::MethodInstance, @no return tree end -""" - codeinstance_for_const_with_code(interp::AbstractInterpreter, code::CodeInstance) - -Given a constabi `CodeInstance`, create another (uncached) CodeInstance that contains the dummy code created -by [`codeinfo_for_const`](@ref) for use in reflection functions that require this. See [`codeinfo_for_const`](@ref) for -more details. -""" -function codeinstance_for_const_with_code(interp::AbstractInterpreter, code::CodeInstance) - src = codeinfo_for_const(interp, code.def, code.rettype_const) - return CodeInstance(code.def, cache_owner(interp), code.rettype, code.exctype, code.rettype_const, src, - Int32(0x3), code.min_world, code.max_world, - code.ipo_purity_bits, code.analysis_results, - code.relocatability, src.debuginfo, src.edges) -end - result_is_constabi(interp::AbstractInterpreter, result::InferenceResult) = may_discard_trees(interp) && is_result_constabi_eligible(result) @@ -1051,44 +1051,35 @@ it has constabi) or one that can be made so by compiling its `->inferred` field. N.B.: The `->inferred` field is volatile and the compiler may delete it. -In such a case, it will first set the `invoke` field to a method that -will block the thread until compilation is completed. """ const SOURCE_MODE_ABI = 0x1 -""" - SOURCE_MODE_FORCE_SOURCE - -Indicates that inference must always produce source in the `->inferred` field. -This may mean that inference will need to re-do inference (if the `->inferred` -field was previously deleted by the JIT) or may need to synthesize source for -other kinds of CodeInstances. - -N.B.: The same caching considerations as SOURCE_MODE_ABI apply. -""" -const SOURCE_MODE_FORCE_SOURCE = 0x2 - -function ci_has_source(code::CodeInstance) - inf = @atomic :monotonic code.inferred - return code.owner === nothing ? (isa(inf, CodeInfo) || isa(inf, String)) : inf !== nothing -end - """ ci_has_abi(code::CodeInstance) Determine whether this CodeInstance is something that could be invoked if we gave it -to the runtime system (either because it already has an ->invoke ptr, or because it -has source that could be compiled). 
+to the runtime system (either because it already has an ->invoke ptr, or +because it has source that could be compiled). Note that this information may +be stale by the time the user see it, so the user will need to perform their +own checks if they actually need the abi from it. """ function ci_has_abi(code::CodeInstance) - ci_has_source(code) && return true - return code.invoke !== C_NULL + (@atomic :acquire code.invoke) !== C_NULL && return true + inf = @atomic :monotonic code.inferred + if code.owner === nothing ? (isa(inf, CodeInfo) || isa(inf, String)) : inf !== nothing + # interp.codegen[code] = maybe_uncompress(code, inf) # TODO: the correct way to ensure this information doesn't become stale would be to push it into the stable codegen cache + return true + end + return false +end + +function ci_has_invoke(code::CodeInstance) + return (@atomic :monotonic code.invoke) !== C_NULL end function ci_meets_requirement(code::CodeInstance, source_mode::UInt8) source_mode == SOURCE_MODE_NOT_REQUIRED && return true source_mode == SOURCE_MODE_ABI && return ci_has_abi(code) - source_mode == SOURCE_MODE_FORCE_SOURCE && return ci_has_source(code) return false end @@ -1098,11 +1089,6 @@ function typeinf_ext(interp::AbstractInterpreter, mi::MethodInstance, source_mod let code = get(code_cache(interp), mi, nothing) if code isa CodeInstance # see if this code already exists in the cache - if source_mode == SOURCE_MODE_FORCE_SOURCE && use_const_api(code) - code = codeinstance_for_const_with_code(interp, code) - ccall(:jl_typeinf_timing_end, Cvoid, (UInt64,), start_time) - return code - end if ci_meets_requirement(code, source_mode) ccall(:jl_typeinf_timing_end, Cvoid, (UInt64,), start_time) return code @@ -1124,12 +1110,6 @@ function typeinf_ext(interp::AbstractInterpreter, mi::MethodInstance, source_mod let code = get(code_cache(interp), mi, nothing) if code isa CodeInstance # see if this code already exists in the cache - if source_mode == SOURCE_MODE_FORCE_SOURCE && use_const_api(code) - engine_reject(interp, ci) - code = codeinstance_for_const_with_code(interp, code) - ccall(:jl_typeinf_timing_end, Cvoid, (UInt64,), start_time) - return code - end if ci_meets_requirement(code, source_mode) engine_reject(interp, ci) ccall(:jl_typeinf_timing_end, Cvoid, (UInt64,), start_time) @@ -1148,26 +1128,15 @@ function typeinf_ext(interp::AbstractInterpreter, mi::MethodInstance, source_mod ccall(:jl_typeinf_timing_end, Cvoid, (UInt64,), start_time) ci = result.ci # reload from result in case it changed - if source_mode == SOURCE_MODE_ABI && frame.cache_mode != CACHE_MODE_GLOBAL - # XXX: jl_type_infer somewhat ambiguously assumes this must be cached, while jl_ci_cache_lookup sort of ambiguously re-caches it + @assert frame.cache_mode != CACHE_MODE_NULL + @assert is_result_constabi_eligible(result) || (!isdefined(interp, :codegen) || haskey(interp.codegen, ci)) + @assert is_result_constabi_eligible(result) == use_const_api(ci) + @assert isdefined(ci, :inferred) "interpreter did not fulfill our expectations" + if !is_cached(frame) && source_mode == SOURCE_MODE_ABI + # XXX: jl_type_infer somewhat ambiguously assumes this must be cached # XXX: this should be using the CI from the cache, if possible instead: haskey(cache, mi) && (ci = cache[mi]) - @assert isdefined(ci, :inferred) "interpreter did not fulfill its requirements" code_cache(interp)[mi] = ci end - if source_mode == SOURCE_MODE_FORCE_SOURCE && use_const_api(ci) - # If the caller cares about the code and this is constabi, still use our synthesis function 
- # anyway, because we will have not finished inferring the code inside the CodeInstance once - # we realized it was constabi, but we want reflection to pretend that we did. - # XXX: the one user of this does not actually want this behavior, but it is required by the flag definition currently - ci = codeinstance_for_const_with_code(interp, ci) - @assert ci_meets_requirement(ci, source_mode) - return ci - end - if !ci_meets_requirement(ci, source_mode) - can_discard_trees = false - finish!(interp, frame; can_discard_trees) # redo finish! with the correct can_discard_trees parameter value - @assert ci_meets_requirement(ci, source_mode) - end return ci end @@ -1212,10 +1181,104 @@ function typeinf_type(interp::AbstractInterpreter, mi::MethodInstance) return widenconst(ignorelimited(result.result)) end -# This is a bridge for the C code calling `jl_typeinf_func()` -typeinf_ext_toplevel(mi::MethodInstance, world::UInt, source_mode::UInt8) = typeinf_ext_toplevel(NativeInterpreter(world), mi, source_mode) -function typeinf_ext_toplevel(interp::AbstractInterpreter, mi::MethodInstance, source_mode::UInt8) - return typeinf_ext(interp, mi, source_mode) +# collect a list of all code that is needed along with CodeInstance to codegen it fully +function collectinvokes!(wq::Vector{CodeInstance}, ci::CodeInfo) + src = ci.code + for i = 1:length(src) + stmt = src[i] + isexpr(stmt, :(=)) && (stmt = stmt.args[2]) + if isexpr(stmt, :invoke) || isexpr(stmt, :invoke_modify) + edge = stmt.args[1] + edge isa CodeInstance && isdefined(edge, :inferred) && push!(wq, edge) + end + # TODO: handle other StmtInfo like @cfunction and OpaqueClosure? + end +end + +# This is a bridge for the C code calling `jl_typeinf_func()` on a single Method match +function typeinf_ext_toplevel(mi::MethodInstance, world::UInt, source_mode::UInt8) + interp = NativeInterpreter(world) + ci = typeinf_ext(interp, mi, source_mode) + if source_mode == SOURCE_MODE_ABI && ci isa CodeInstance && !ci_has_invoke(ci) + inspected = IdSet{CodeInstance}() + tocompile = Vector{CodeInstance}() + push!(tocompile, ci) + while !isempty(tocompile) + # ci_has_real_invoke(ci) && return ci # optimization: cease looping if ci happens to get compiled (not just jl_fptr_wait_for_compiled, but fully jl_is_compiled_codeinst) + callee = pop!(tocompile) + ci_has_invoke(callee) && continue + callee in inspected && continue + src = get(interp.codegen, callee, nothing) + if !isa(src, CodeInfo) + src = @atomic :monotonic callee.inferred + if isa(src, String) + src = _uncompressed_ir(callee, src) + end + if !isa(src, CodeInfo) + newcallee = typeinf_ext(interp, callee.def, source_mode) + if newcallee isa CodeInstance + callee === ci && (ci = newcallee) # ci stopped meeting the requirements after typeinf_ext last checked, try again with newcallee + push!(tocompile, newcallee) + #else + # println("warning: could not get source code for ", callee.def) + end + continue + end + end + push!(inspected, callee) + collectinvokes!(tocompile, src) + ccall(:jl_add_codeinst_to_jit, Cvoid, (Any, Any), callee, src) + end + end + return ci +end + +# This is a bridge for the C code calling `jl_typeinf_func()` on set of Method matches +function typeinf_ext_toplevel(methods::Vector{Any}, worlds::Vector{UInt}, trim::Bool) + inspected = IdSet{CodeInstance}() + tocompile = Vector{CodeInstance}() + codeinfos = [] + # first compute the ABIs of everything + for this_world in reverse(sort!(worlds)) + interp = NativeInterpreter(this_world) + for i = 1:length(methods) + # each item in this list is either 
a MethodInstance indicating something
+            # to compile, or an svec(rettype, sig) describing a C-callable alias to create.
+            item = methods[i]
+            if item isa MethodInstance
+                # if this method is generally visible to the current compilation world,
+                # and this is either the primary world, or not applicable in the primary world
+                # then we want to compile and emit this
+                if item.def.primary_world <= this_world <= item.def.deleted_world
+                    ci = typeinf_ext(interp, item, SOURCE_MODE_NOT_REQUIRED)
+                    ci isa CodeInstance && !use_const_api(ci) && push!(tocompile, ci)
+                end
+            elseif item isa SimpleVector
+                push!(codeinfos, item[1]::Type)
+                push!(codeinfos, item[2]::Type)
+            end
+        end
+        while !isempty(tocompile)
+            callee = pop!(tocompile)
+            callee in inspected && continue
+            push!(inspected, callee)
+            # now make sure everything has source code, if desired
+            # TODO: typeinf_code could return something with different edges/ages (needing an update to callee), which we don't handle here
+            if use_const_api(callee)
+                src = codeinfo_for_const(interp, callee.def, callee.rettype_const)
+            elseif haskey(interp.codegen, callee)
+                src = interp.codegen[callee]
+            else
+                src = typeinf_code(interp, callee.def, true)
+            end
+            if src isa CodeInfo
+                collectinvokes!(tocompile, src)
+                push!(codeinfos, callee)
+                push!(codeinfos, src)
+            end
+        end
+    end
+    return codeinfos
 end
 
 function return_type(@nospecialize(f), t::DataType) # this method has a special tfunc
diff --git a/Compiler/src/types.jl b/Compiler/src/types.jl
index 5669ec3175c9e..6ffb5402682f3 100644
--- a/Compiler/src/types.jl
+++ b/Compiler/src/types.jl
@@ -366,6 +366,7 @@ struct NativeInterpreter <: AbstractInterpreter
 
     # Cache of inference results for this particular interpreter
     inf_cache::Vector{InferenceResult}
+    codegen::IdDict{CodeInstance,CodeInfo}
 
     # Parameters for inference and optimization
    inf_params::InferenceParams
@@ -386,16 +387,8 @@ function NativeInterpreter(world::UInt = get_world_counter();
     @assert world <= curr_max_world
     method_table = CachedMethodTable(InternalMethodTable(world))
     inf_cache = Vector{InferenceResult}() # Initially empty cache
-    return NativeInterpreter(world, method_table, inf_cache, inf_params, opt_params)
-end
-
-function NativeInterpreter(interp::NativeInterpreter;
-                           world::UInt = interp.world,
-                           method_table::CachedMethodTable{InternalMethodTable} = interp.method_table,
-                           inf_cache::Vector{InferenceResult} = interp.inf_cache,
-                           inf_params::InferenceParams = interp.inf_params,
-                           opt_params::OptimizationParams = interp.opt_params)
-    return NativeInterpreter(world, method_table, inf_cache, inf_params, opt_params)
+    codegen = IdDict{CodeInstance,CodeInfo}()
+    return NativeInterpreter(world, method_table, inf_cache, codegen, inf_params, opt_params)
 end
 
 # Quickly and easily satisfy the AbstractInterpreter API contract
diff --git a/Compiler/src/utilities.jl b/Compiler/src/utilities.jl
index 196722f8bca33..c322d1062cea1 100644
--- a/Compiler/src/utilities.jl
+++ b/Compiler/src/utilities.jl
@@ -149,6 +149,9 @@ end
 isa_compileable_sig(@nospecialize(atype), sparams::SimpleVector, method::Method) =
     !iszero(ccall(:jl_isa_compileable_sig, Int32, (Any, Any, Any), atype, sparams, method))
 
+isa_compileable_sig(m::MethodInstance) = (def = m.def; !isa(def, Method) || isa_compileable_sig(m.specTypes, m.sparam_vals, def))
+isa_compileable_sig(m::ABIOverride) = false
+
 has_typevar(@nospecialize(t), v::TypeVar) = ccall(:jl_has_typevar, Cint, (Any, Any), t, v) != 0
 
 """
diff --git a/Compiler/test/abioverride.jl b/Compiler/test/abioverride.jl
index
d35fa8876cf1c..e257074852099 100644 --- a/Compiler/test/abioverride.jl +++ b/Compiler/test/abioverride.jl @@ -12,19 +12,23 @@ struct SecondArgConstOverride arg2::Int end -world = Base.tls_world_age() -mi = Base.specialize_method(only(Base._methods_by_ftype(Tuple{typeof(myplus), Int, Int}, -1, world))) -interp = Compiler.NativeInterpreter(world) -ci = Compiler.typeinf_ext(interp, mi, Compiler.SOURCE_MODE_FORCE_SOURCE) - function is_known_call(@nospecialize(x), @nospecialize(func), src::Core.CodeInfo) isexpr(x, :call) || return false ft = Compiler.argextype(x.args[1], src, Compiler.VarState[]) return Compiler.singleton_type(ft) === func end + # Construct a CodeInstance with an ABI override -new_ci = let new_source = copy(Base.uncompressed_ir(ci)) +let world = Base.tls_world_age() + # Get some inferred source code to give to the compiler + # Do not look at a CodeInstance here, since those fields are only valid to + # use while attached to a cache, and are thus invalid to make copies of + # (since you'd have to have made the copy to insert into the cache before + # making the original CodeInstance to copy from, which is obviously + # rather temporally-challenged) + new_source = only(code_typed(myplus, (Int, Int)))[1] + mi = new_source.parent ## Sanity check @assert length(new_source.code) == 2 add = new_source.code[1] @@ -37,15 +41,19 @@ new_ci = let new_source = copy(Base.uncompressed_ir(ci)) resize!(new_source.slotnames, 2) resize!(new_source.slotflags, 2) - # Construct the CodeInstance - new_ci = Core.CodeInstance(Core.ABIOverride(Tuple{typeof(myplus), Int}, mi), - SecondArgConstOverride(1), ci.rettype, ci.exctype, nothing, new_source, - Int32(0), ci.min_world, ci.max_world, ci.ipo_purity_bits, nothing, ci.relocatability, ci.debuginfo, ci.edges) + # Construct the CodeInstance from the modified CodeInfo data + global new_ci = Core.CodeInstance(Core.ABIOverride(Tuple{typeof(myplus), Int}, mi), + #=owner=#SecondArgConstOverride(1), new_source.rettype, Any#=new_source.exctype is missing=#, + #=inferred_const=#nothing, #=code=#nothing, #=const_flags=#Int32(0), + new_source.min_world, new_source.max_world, #=new_source.ipo_purity_bits is missing=#UInt32(0), + #=analysis_results=#nothing, #=not relocatable?=#UInt8(0), new_source.debuginfo, new_source.edges) # Poke the CI into the global cache + # This isn't necessary, but does conveniently give it the mandatory permanent GC-root before calling `invoke` ccall(:jl_mi_cache_insert, Cvoid, (Any, Any), mi, new_ci) - new_ci + # Poke the source code into the JIT for it + ccall(:jl_add_codeinst_to_jit, Cvoid, (Any, Any), new_ci, new_source) end @test contains(repr(new_ci), "ABI Overridden") diff --git a/base/reflection.jl b/base/reflection.jl index f7952ac7a78d7..f9c5dd9765533 100644 --- a/base/reflection.jl +++ b/base/reflection.jl @@ -132,14 +132,6 @@ uncompressed_ir(m::Method) = isdefined(m, :source) ? _uncompressed_ir(m) : isdefined(m, :generator) ? 
error("Method is @generated; try `code_lowered` instead.") : error("Code for this Method is not available.") -function uncompressed_ir(ci::CodeInstance) - inferred = ci.inferred - isa(inferred, CodeInfo) && return inferred - isa(inferred, String) && return _uncompressed_ir(ci, inferred) - inferred === nothing && error("Inferred code was deleted.") - error(string("Unknown inferred code type ", typeof(inferred))) -end - # for backwards compat const uncompressed_ast = uncompressed_ir const _uncompressed_ast = _uncompressed_ir diff --git a/src/aotcompile.cpp b/src/aotcompile.cpp index b238c44c52676..0600106c41aa6 100644 --- a/src/aotcompile.cpp +++ b/src/aotcompile.cpp @@ -52,7 +52,6 @@ using namespace llvm; #define DEBUG_TYPE "julia_aotcompile" -STATISTIC(CICacheLookups, "Number of codeinst cache lookups"); STATISTIC(CreateNativeCalls, "Number of jl_create_native calls made"); STATISTIC(CreateNativeMethods, "Number of methods compiled for jl_create_native"); STATISTIC(CreateNativeMax, "Max number of methods compiled at once for jl_create_native"); @@ -307,37 +306,6 @@ static void makeSafeName(GlobalObject &G) G.setName(StringRef(SafeName.data(), SafeName.size())); } -static jl_code_instance_t *jl_ci_cache_lookup(jl_method_instance_t *mi, size_t world, jl_codeinstance_lookup_t lookup) -{ - ++CICacheLookups; - jl_value_t *ci = lookup(mi, world, world); - JL_GC_PROMISE_ROOTED(ci); - jl_code_instance_t *codeinst = NULL; - if (ci != jl_nothing && jl_atomic_load_relaxed(&((jl_code_instance_t *)ci)->inferred) != jl_nothing) { - codeinst = (jl_code_instance_t*)ci; - } - else { - if (lookup != jl_rettype_inferred_addr) { - // XXX: This will corrupt and leak a lot of memory which may be very bad - jl_error("Refusing to automatically run type inference with custom cache lookup."); - } - else { - // XXX: SOURCE_MODE_ABI is wrong here (not sufficient) - codeinst = jl_type_infer(mi, world, SOURCE_MODE_ABI); - /* Even if this codeinst is ordinarily not cacheable, we need to force - * it into the cache here, since it was explicitly requested and is - * otherwise not reachable from anywhere in the system image. - */ - if (codeinst && !jl_mi_cache_has_ci(mi, codeinst)) { - JL_GC_PUSH1(&codeinst); - jl_mi_cache_insert(mi, codeinst); - JL_GC_POP(); - } - } - } - return codeinst; -} - namespace { // file-local namespace class egal_set { public: @@ -432,7 +400,7 @@ static void aot_optimize_roots(jl_codegen_params_t ¶ms, egal_set &method_roo } } -static void compile_workqueue(jl_codegen_params_t ¶ms, egal_set &method_roots, CompilationPolicy policy, jl_compiled_functions_t &compiled_functions) +static void resolve_workqueue(jl_codegen_params_t ¶ms, egal_set &method_roots, jl_compiled_functions_t &compiled_functions) { decltype(params.workqueue) workqueue; std::swap(params.workqueue, workqueue); @@ -449,24 +417,6 @@ static void compile_workqueue(jl_codegen_params_t ¶ms, egal_set &method_root bool preal_specsig = false; { auto it = compiled_functions.find(codeinst); - if (it == compiled_functions.end()) { - // Reinfer the function. The JIT came along and removed the inferred - // method body. 
See #34993 - if ((policy != CompilationPolicy::Default || params.params->trim) && - jl_atomic_load_relaxed(&codeinst->inferred) == jl_nothing) { - // XXX: SOURCE_MODE_FORCE_SOURCE is wrong here (neither sufficient nor necessary) - codeinst = jl_type_infer(jl_get_ci_mi(codeinst), jl_atomic_load_relaxed(&codeinst->max_world), SOURCE_MODE_FORCE_SOURCE); - } - if (codeinst) { - orc::ThreadSafeModule result_m = - jl_create_ts_module(name_from_method_instance(jl_get_ci_mi(codeinst)), - params.tsctx, params.DL, params.TargetTriple); - auto decls = jl_emit_codeinst(result_m, codeinst, NULL, params); - record_method_roots(method_roots, jl_get_ci_mi(codeinst)); - if (result_m) - it = compiled_functions.insert(std::make_pair(codeinst, std::make_pair(std::move(result_m), std::move(decls)))).first; - } - } if (it != compiled_functions.end()) { auto &decls = it->second.second; invokeName = decls.functionObject; @@ -478,6 +428,12 @@ static void compile_workqueue(jl_codegen_params_t ¶ms, egal_set &method_root preal_specsig = true; } } + else if (params.params->trim) { + jl_safe_printf("warning: no code provided for function"); + jl_(codeinst->def); + if (params.params->trim) + abort(); + } } // patch up the prototype we emitted earlier Module *mod = proto.decl->getParent(); @@ -485,7 +441,7 @@ static void compile_workqueue(jl_codegen_params_t ¶ms, egal_set &method_root Function *pinvoke = nullptr; if (preal_decl.empty()) { if (invokeName.empty() && params.params->trim) { - errs() << "Bailed out to invoke when compiling:"; + jl_safe_printf("warning: bailed out to invoke when compiling:"); jl_(codeinst->def); abort(); } @@ -543,31 +499,109 @@ static void compile_workqueue(jl_codegen_params_t ¶ms, egal_set &method_root JL_GC_POP(); } + // takes the running content that has collected in the shadow module and dump it to disk -// this builds the object file portion of the sysimage files for fast startup, and can +// this builds the object file portion of the sysimage files for fast startup +// `_external_linkage` create linkages between pkgimages. +extern "C" JL_DLLEXPORT_CODEGEN +void *jl_create_native_impl(jl_array_t *methods, LLVMOrcThreadSafeModuleRef llvmmod, int _trim, int _external_linkage, size_t world) +{ + JL_TIMING(INFERENCE, INFERENCE); + auto ct = jl_current_task; + bool timed = (ct->reentrant_timing & 1) == 0; + if (timed) + ct->reentrant_timing |= 1; + uint64_t compiler_start_time = 0; + uint8_t measure_compile_time_enabled = jl_atomic_load_relaxed(&jl_measure_compile_time_enabled); + if (measure_compile_time_enabled) + compiler_start_time = jl_hrtime(); + + jl_cgparams_t cgparams = jl_default_cgparams; + cgparams.trim = _trim ? 1 : 0; + size_t compile_for[] = { jl_typeinf_world, world }; + int compiler_world = 1; + if (_trim || compile_for[0] == 0) + compiler_world = 0; + jl_value_t **fargs; + JL_GC_PUSHARGS(fargs, 4); + jl_array_t *codeinfos = NULL; + if (jl_typeinf_func) { + fargs[0] = (jl_value_t*)jl_typeinf_func; + fargs[1] = (jl_value_t*)methods; +#ifdef _P64 + jl_value_t *jl_array_ulong_type = jl_array_uint64_type; +#else + jl_value_t *jl_array_ulong_type = jl_array_uint32_type; +#endif + jl_array_t *worlds = jl_alloc_array_1d(jl_array_ulong_type, 1 + compiler_world); + fargs[2] = (jl_value_t*)worlds; + jl_array_data(worlds, size_t)[0] = jl_typeinf_world; + jl_array_data(worlds, size_t)[compiler_world] = world; // might overwrite previous + fargs[3] = _trim ? 
jl_true : jl_false; + size_t last_age = ct->world_age; + ct->world_age = jl_typeinf_world; + codeinfos = (jl_array_t*)jl_apply(fargs, 4); + ct->world_age = last_age; + JL_TYPECHK(create_native, array_any, (jl_value_t*)codeinfos); + } + else { + // we could put a very simple generator here, but there is no reason to do that right now + jl_error("inference not available for generating compiled output"); + } + fargs[0] = (jl_value_t*)codeinfos; + void *data = jl_emit_native(codeinfos, llvmmod, &cgparams, _external_linkage); + + // move everything inside, now that we've merged everything + // (before adding the exported headers) + ((jl_native_code_desc_t*)data)->M.withModuleDo([&](Module &M) { + auto TT = Triple(M.getTargetTriple()); + Function *juliapersonality_func = nullptr; + if (TT.isOSWindows() && TT.getArch() == Triple::x86_64) { + // setting the function personality enables stack unwinding and catching exceptions + // so make sure everything has something set + Type *T_int32 = Type::getInt32Ty(M.getContext()); + juliapersonality_func = Function::Create(FunctionType::get(T_int32, true), + Function::ExternalLinkage, "__julia_personality", M); + juliapersonality_func->setDLLStorageClass(GlobalValue::DLLImportStorageClass); + } + for (GlobalObject &G : M.global_objects()) { + if (!G.isDeclaration()) { + G.setLinkage(GlobalValue::InternalLinkage); + G.setDSOLocal(true); + makeSafeName(G); + if (Function *F = dyn_cast(&G)) { + if (juliapersonality_func) { + // Add unwind exception personalities to functions to handle async exceptions + F->setPersonalityFn(juliapersonality_func); + } + } + } + } + }); + + JL_GC_POP(); + if (timed) { + if (measure_compile_time_enabled) { + auto end = jl_hrtime(); + jl_atomic_fetch_add_relaxed(&jl_cumulative_compile_time, end - compiler_start_time); + } + ct->reentrant_timing &= ~1ull; + } + return data; +} + // also be used be extern consumers like GPUCompiler.jl to obtain a module containing // all reachable & inferrrable functions. -// The `policy` flag switches between the default mode `0` and the extern mode `1` used by GPUCompiler. -// `_imaging_mode` controls if raw pointers can be embedded (e.g. the code will be loaded into the same session). -// `_external_linkage` create linkages between pkgimages. extern "C" JL_DLLEXPORT_CODEGEN -void *jl_create_native_impl(jl_array_t *methods, LLVMOrcThreadSafeModuleRef llvmmod, const jl_cgparams_t *cgparams, int _policy, int _imaging_mode, int _external_linkage, size_t _world, jl_codeinstance_lookup_t lookup) +void *jl_emit_native_impl(jl_array_t *codeinfos, LLVMOrcThreadSafeModuleRef llvmmod, const jl_cgparams_t *cgparams, int _external_linkage) { JL_TIMING(NATIVE_AOT, NATIVE_Create); ++CreateNativeCalls; - CreateNativeMax.updateMax(jl_array_nrows(methods)); + CreateNativeMax.updateMax(jl_array_nrows(codeinfos)); if (cgparams == NULL) cgparams = &jl_default_cgparams; - if (lookup == NULL) - lookup = &jl_rettype_inferred_native; jl_native_code_desc_t *data = new jl_native_code_desc_t; - CompilationPolicy policy = (CompilationPolicy) _policy; - bool imaging = imaging_default() || _imaging_mode == 1; jl_method_instance_t *mi = NULL; - auto ct = jl_current_task; - bool timed = (ct->reentrant_timing & 1) == 0; - if (timed) - ct->reentrant_timing |= 1; orc::ThreadSafeContext ctx; orc::ThreadSafeModule backing; if (!llvmmod) { @@ -577,13 +611,7 @@ void *jl_create_native_impl(jl_array_t *methods, LLVMOrcThreadSafeModuleRef llvm orc::ThreadSafeModule &clone = llvmmod ? 
*unwrap(llvmmod) : backing; auto ctxt = clone.getContext(); - uint64_t compiler_start_time = 0; - uint8_t measure_compile_time_enabled = jl_atomic_load_relaxed(&jl_measure_compile_time_enabled); - if (measure_compile_time_enabled) - compiler_start_time = jl_hrtime(); - // compile all methods for the current world and type-inference world - auto target_info = clone.withModuleDo([&](Module &M) { return std::make_pair(M.getDataLayout(), Triple(M.getTargetTriple())); }); @@ -592,84 +620,57 @@ void *jl_create_native_impl(jl_array_t *methods, LLVMOrcThreadSafeModuleRef llvm if (!llvmmod) params.getContext().setDiscardValueNames(true); params.params = cgparams; - params.imaging_mode = imaging; + assert(params.imaging_mode); // `_imaging_mode` controls if broken features like code-coverage are disabled params.external_linkage = _external_linkage; params.temporary_roots = jl_alloc_array_1d(jl_array_any_type, 0); JL_GC_PUSH3(¶ms.temporary_roots, &method_roots.list, &method_roots.keyset); - size_t compile_for[] = { jl_typeinf_world, _world }; - int worlds = 0; - if (jl_options.trim != JL_TRIM_NO) - worlds = 1; jl_compiled_functions_t compiled_functions; - for (; worlds < 2; worlds++) { - JL_TIMING(NATIVE_AOT, NATIVE_Codegen); - size_t this_world = compile_for[worlds]; - if (!this_world) - continue; - // Don't emit methods for the typeinf_world with extern policy - if (policy != CompilationPolicy::Default && this_world == jl_typeinf_world) - continue; - size_t i, l; - for (i = 0, l = jl_array_nrows(methods); i < l; i++) { - // each item in this list is either a MethodInstance indicating something - // to compile, or an svec(rettype, sig) describing a C-callable alias to create. - jl_value_t *item = jl_array_ptr_ref(methods, i); - if (jl_is_simplevector(item)) { - if (worlds == 1) - jl_compile_extern_c(wrap(&clone), ¶ms, NULL, jl_svecref(item, 0), jl_svecref(item, 1)); - continue; - } - mi = (jl_method_instance_t*)item; - // if this method is generally visible to the current compilation world, - // and this is either the primary world, or not applicable in the primary world - // then we want to compile and emit this - if (jl_atomic_load_relaxed(&mi->def.method->primary_world) <= this_world && this_world <= jl_atomic_load_relaxed(&mi->def.method->deleted_world)) { - // find and prepare the source code to compile - jl_code_instance_t *codeinst = jl_ci_cache_lookup(mi, this_world, lookup); - JL_GC_PROMISE_ROOTED(codeinst); - if (jl_options.trim != JL_TRIM_NO && !codeinst) { - // If we're building a small image, we need to compile everything - // to ensure that we have all the information we need. 
- jl_safe_printf("Codegen decided not to compile code root"); - jl_(mi); - abort(); - } - if (codeinst && !compiled_functions.count(codeinst) && !data->jl_fvar_map.count(codeinst)) { - // now add it to our compilation results - // Const returns do not do codegen, but juliac inspects codegen results so make a dummy fvar entry to represent it - if (jl_options.trim != JL_TRIM_NO && jl_atomic_load_relaxed(&codeinst->invoke) == jl_fptr_const_return_addr) { - data->jl_fvar_map[codeinst] = std::make_tuple((uint32_t)-3, (uint32_t)-3); - } - else { - orc::ThreadSafeModule result_m = jl_create_ts_module(name_from_method_instance(jl_get_ci_mi(codeinst)), - params.tsctx, clone.getModuleUnlocked()->getDataLayout(), - Triple(clone.getModuleUnlocked()->getTargetTriple())); - jl_llvm_functions_t decls = jl_emit_codeinst(result_m, codeinst, NULL, params); - JL_GC_PROMISE_ROOTED(codeinst->def); // analyzer seems confused - record_method_roots(method_roots, jl_get_ci_mi(codeinst)); - if (result_m) - compiled_functions[codeinst] = {std::move(result_m), std::move(decls)}; - else if (jl_options.trim != JL_TRIM_NO) { - // if we're building a small image, we need to compile everything - // to ensure that we have all the information we need. - jl_safe_printf("codegen failed to compile code root"); - jl_(mi); - abort(); - } - } + size_t i, l; + for (i = 0, l = jl_array_nrows(codeinfos); i < l; i++) { + // each item in this list is either a CodeInstance followed by a CodeInfo indicating something + // to compile, or a rettype followed by a sig describing a C-callable alias to create. + jl_value_t *item = jl_array_ptr_ref(codeinfos, i); + if (jl_is_code_instance(item)) { + // now add it to our compilation results + jl_code_instance_t *codeinst = (jl_code_instance_t*)item; + jl_code_info_t *src = (jl_code_info_t*)jl_array_ptr_ref(codeinfos, ++i); + assert(jl_is_code_info(src)); + if (compiled_functions.count(codeinst)) + continue; // skip any duplicates that accidentally made there way in here (or make this an error?) + if (_external_linkage) { + uint8_t specsigflags; + jl_callptr_t invoke; + void *fptr; + jl_read_codeinst_invoke(codeinst, &specsigflags, &invoke, &fptr, 0); + if (invoke != NULL && (specsigflags & 0b100)) { + // this codeinst is already available externally + // TODO: for performance, avoid generating the src code when we know it would reach here anyways + continue; } } - else if (this_world != jl_typeinf_world) { - /* - jl_safe_printf("Codegen could not find requested codeinstance to be compiled\n"); + orc::ThreadSafeModule result_m = jl_create_ts_module(name_from_method_instance(jl_get_ci_mi(codeinst)), + params.tsctx, clone.getModuleUnlocked()->getDataLayout(), + Triple(clone.getModuleUnlocked()->getTargetTriple())); + jl_llvm_functions_t decls = jl_emit_codeinst(result_m, codeinst, src, params); + record_method_roots(method_roots, jl_get_ci_mi(codeinst)); + if (result_m) + compiled_functions[codeinst] = {std::move(result_m), std::move(decls)}; + else if (params.params->trim) { + // if we're building a small image, we need to compile everything + // to ensure that we have all the information we need. 
+ jl_safe_printf("codegen failed to compile code root"); jl_(mi); abort(); - */ } } + else { + jl_value_t *sig = jl_array_ptr_ref(codeinfos, ++i); + assert(jl_is_type(item) && jl_is_type(sig)); + jl_compile_extern_c(wrap(&clone), ¶ms, NULL, item, sig); + } } - // finally, make sure all referenced methods also get compiled or fixed up - compile_workqueue(params, method_roots, policy, compiled_functions); + // finally, make sure all referenced methods get fixed up, particularly if the user declined to compile them + resolve_workqueue(params, method_roots, compiled_functions); aot_optimize_roots(params, method_roots, compiled_functions); params.temporary_roots = nullptr; JL_GC_POP(); @@ -712,7 +713,6 @@ void *jl_create_native_impl(jl_array_t *methods, LLVMOrcThreadSafeModuleRef llvm // clones the contents of the module `m` to the shadow_output collector // while examining and recording what kind of function pointer we have { - JL_TIMING(NATIVE_AOT, NATIVE_Merge); Linker L(*clone.getModuleUnlocked()); for (auto &def : compiled_functions) { jl_merge_module(clone, std::move(std::get<0>(def.second))); @@ -762,45 +762,7 @@ void *jl_create_native_impl(jl_array_t *methods, LLVMOrcThreadSafeModuleRef llvm } CreateNativeGlobals += gvars.size(); - //Safe b/c context is locked by params - auto TT = Triple(clone.getModuleUnlocked()->getTargetTriple()); - Function *juliapersonality_func = nullptr; - if (TT.isOSWindows() && TT.getArch() == Triple::x86_64) { - // setting the function personality enables stack unwinding and catching exceptions - // so make sure everything has something set - Type *T_int32 = Type::getInt32Ty(clone.getModuleUnlocked()->getContext()); - juliapersonality_func = Function::Create(FunctionType::get(T_int32, true), - Function::ExternalLinkage, "__julia_personality", clone.getModuleUnlocked()); - juliapersonality_func->setDLLStorageClass(GlobalValue::DLLImportStorageClass); - } - - // move everything inside, now that we've merged everything - // (before adding the exported headers) - if (policy == CompilationPolicy::Default) { - //Safe b/c context is locked by params - for (GlobalObject &G : clone.getModuleUnlocked()->global_objects()) { - if (!G.isDeclaration()) { - G.setLinkage(GlobalValue::InternalLinkage); - G.setDSOLocal(true); - makeSafeName(G); - if (Function *F = dyn_cast(&G)) { - if (TT.isOSWindows() && TT.getArch() == Triple::x86_64) { - // Add unwind exception personalities to functions to handle async exceptions - F->setPersonalityFn(juliapersonality_func); - } - } - } - } - } - data->M = std::move(clone); - if (timed) { - if (measure_compile_time_enabled) { - auto end = jl_hrtime(); - jl_atomic_fetch_add_relaxed(&jl_cumulative_compile_time, end - compiler_start_time); - } - ct->reentrant_timing &= ~1ull; - } return (void*)data; } @@ -1954,8 +1916,7 @@ void jl_dump_native_impl(void *native_code, sysimg_outputs = compile(sysimgM, "sysimg", 1, [](Module &) {}); } - bool imaging_mode = imaging_default() || jl_options.outputo; - + const bool imaging_mode = true; unsigned threads = 1; unsigned nfvars = 0; unsigned ngvars = 0; @@ -2062,8 +2023,7 @@ void jl_dump_native_impl(void *native_code, data_outputs = compile(*dataM, "text", threads, [data](Module &) { delete data; }); } - if (params->emit_metadata) - { + if (params->emit_metadata) { JL_TIMING(NATIVE_AOT, NATIVE_Metadata); LLVMContext Context; Context.setDiscardValueNames(true); @@ -2258,13 +2218,7 @@ void jl_get_llvmf_defn_impl(jl_llvmf_dump_t* dump, jl_method_instance_t *mi, jl_ }); jl_codegen_params_t output(ctx, 
std::move(target_info.first), std::move(target_info.second)); output.params = ¶ms; - output.imaging_mode = imaging_default(); - // This would be nice, but currently it causes some assembly regressions that make printed output - // differ very significantly from the actual non-imaging mode code. - // // Force imaging mode for names of pointers - // output.imaging = true; - // This would also be nice, but it seems to cause OOMs on the windows32 builder - // To get correct names in the IR this needs to be at least 2 + output.imaging_mode = jl_options.image_codegen; output.temporary_roots = jl_alloc_array_1d(jl_array_any_type, 0); JL_GC_PUSH1(&output.temporary_roots); auto decls = jl_emit_code(m, mi, src, NULL, output); diff --git a/src/ccall.cpp b/src/ccall.cpp index 3937570896f82..1b635ca40840f 100644 --- a/src/ccall.cpp +++ b/src/ccall.cpp @@ -697,14 +697,12 @@ static jl_cgval_t emit_cglobal(jl_codectx_t &ctx, jl_value_t **args, size_t narg } else if (sym.fptr != NULL) { res = ConstantInt::get(lrt, (uint64_t)sym.fptr); - if (ctx.emission_context.imaging_mode) - jl_printf(JL_STDERR,"WARNING: literal address used in cglobal for %s; code cannot be statically compiled\n", sym.f_name); } else if (sym.f_name != NULL) { if (sym.lib_expr) { res = runtime_sym_lookup(ctx, getPointerTy(ctx.builder.getContext()), NULL, sym.lib_expr, sym.f_name, ctx.f); } - else /*if (ctx.emission_context.imaging) */{ + else { res = runtime_sym_lookup(ctx, getPointerTy(ctx.builder.getContext()), sym.f_lib, NULL, sym.f_name, ctx.f); } } else { @@ -2134,8 +2132,6 @@ jl_cgval_t function_sig_t::emit_a_ccall( Type *funcptype = functype->getPointerTo(0); llvmf = literal_static_pointer_val((void*)(uintptr_t)symarg.fptr, funcptype); setName(ctx.emission_context, llvmf, "ccall_fptr"); - if (ctx.emission_context.imaging_mode) - jl_printf(JL_STDERR,"WARNING: literal address used in ccall for %s; code cannot be statically compiled\n", symarg.f_name); } else if (!ctx.params->use_jlplt) { if ((symarg.f_lib && !((symarg.f_lib == JL_EXE_LIBNAME) || @@ -2152,7 +2148,7 @@ jl_cgval_t function_sig_t::emit_a_ccall( ++DeferredCCallLookups; llvmf = runtime_sym_lookup(ctx, funcptype, NULL, symarg.lib_expr, symarg.f_name, ctx.f); } - else /*if (ctx.emission_context.imaging) */{ + else { ++DeferredCCallLookups; // vararg requires musttail, // but musttail is incompatible with noreturn. diff --git a/src/codegen-stubs.c b/src/codegen-stubs.c index fe50af3f8e84d..5e243ddda28c9 100644 --- a/src/codegen-stubs.c +++ b/src/codegen-stubs.c @@ -46,11 +46,26 @@ JL_DLLEXPORT void jl_generate_fptr_for_unspecialized_fallback(jl_code_instance_t JL_DLLEXPORT int jl_compile_codeinst_fallback(jl_code_instance_t *unspec) { - // Do nothing. The caller will notice that we failed to provide a an ->invoke and trigger + // Do nothing. The caller will notice that we failed to provide an ->invoke and trigger // appropriate fallbacks. 
return 0; } +JL_DLLEXPORT void jl_emit_codeinst_to_jit_fallback(jl_code_instance_t *codeinst, jl_code_info_t *src) +{ + jl_value_t *inferred = jl_atomic_load_relaxed(&codeinst->inferred); + if (jl_is_code_info(inferred)) + return; + if (jl_is_svec(src->edges)) { + jl_atomic_store_release(&codeinst->inferred, (jl_value_t*)src->edges); + jl_gc_wb(codeinst, src->edges); + } + jl_atomic_store_release(&codeinst->debuginfo, src->debuginfo); + jl_gc_wb(codeinst, src->debuginfo); + jl_atomic_store_release(&codeinst->inferred, (jl_value_t*)src); + jl_gc_wb(codeinst, src); +} + JL_DLLEXPORT uint32_t jl_get_LLVM_VERSION_fallback(void) { return 0; @@ -70,7 +85,8 @@ JL_DLLEXPORT size_t jl_jit_total_bytes_fallback(void) return 0; } -JL_DLLEXPORT void *jl_create_native_fallback(jl_array_t *methods, LLVMOrcThreadSafeModuleRef llvmmod, const jl_cgparams_t *cgparams, int _policy, int _imaging_mode, int _external_linkage, size_t _world, jl_codeinstance_lookup_t lookup) UNAVAILABLE +JL_DLLEXPORT void *jl_create_native_fallback(jl_array_t *methods, LLVMOrcThreadSafeModuleRef llvmmod, int _trim, int _external_linkage, size_t _world) UNAVAILABLE +JL_DLLEXPORT void *jl_emit_native_fallback(jl_array_t *codeinfos, LLVMOrcThreadSafeModuleRef llvmmod, const jl_cgparams_t *cgparams, int _external_linkage) UNAVAILABLE JL_DLLEXPORT void jl_dump_compiles_fallback(void *s) { diff --git a/src/codegen.cpp b/src/codegen.cpp index ceaba9507e4c7..04253b67d6b6d 100644 --- a/src/codegen.cpp +++ b/src/codegen.cpp @@ -6771,7 +6771,7 @@ static void emit_latestworld(jl_codectx_t &ctx) (void)store_world; } -// `expr` is not clobbered in JL_TRY +// `expr` is not actually clobbered in JL_TRY JL_GCC_IGNORE_START("-Wclobbered") static jl_cgval_t emit_expr(jl_codectx_t &ctx, jl_value_t *expr, ssize_t ssaidx_0based) { @@ -7297,10 +7297,10 @@ Function *emit_tojlinvoke(jl_code_instance_t *codeinst, StringRef theFptrName, M theFarg = literal_pointer_val(ctx, (jl_value_t*)codeinst); } else { - jl_method_instance_t *mi = codeinst->def; + jl_method_instance_t *mi = jl_get_ci_mi(codeinst); bool is_opaque_closure = jl_is_method(mi->def.value) && mi->def.method->is_for_opaque_closure; theFunc = prepare_call(is_opaque_closure ? 
jlinvokeoc_func : jlinvoke_func); - theFarg = literal_pointer_val(ctx, (jl_value_t*)jl_get_ci_mi(codeinst)); + theFarg = literal_pointer_val(ctx, (jl_value_t*)mi); } theFarg = track_pjlvalue(ctx, theFarg); auto args = f->arg_begin(); @@ -7466,7 +7466,7 @@ static Function *gen_cfun_wrapper( jl_code_instance_t *codeinst = NULL; if (lam) { - // TODO: this isn't ideal to be unconditionally calling type inference (and compile) from here + // TODO: this isn't ideal to be unconditionally calling type inference from here codeinst = jl_type_infer(lam, world, SOURCE_MODE_NOT_REQUIRED); astrt = codeinst->rettype; if (astrt != (jl_value_t*)jl_bottom_type && @@ -9975,9 +9975,6 @@ static jl_llvm_functions_t // --- entry point --- -void jl_add_code_in_flight(StringRef name, jl_code_instance_t *codeinst, const DataLayout &DL); - -JL_GCC_IGNORE_START("-Wclobbered") jl_llvm_functions_t jl_emit_code( orc::ThreadSafeModule &m, jl_method_instance_t *li, @@ -10023,15 +10020,6 @@ jl_llvm_functions_t jl_emit_code( return decls; } -static int effects_foldable(uint32_t effects) -{ - // N.B.: This needs to be kept in sync with Core.Compiler.is_foldable(effects, true) - return ((effects & 0x7) == 0) && // is_consistent(effects) - (((effects >> 10) & 0x03) == 0) && // is_noub(effects) - (((effects >> 3) & 0x03) == 0) && // is_effect_free(effects) - ((effects >> 6) & 0x01); // is_terminates(effects) -} - static jl_llvm_functions_t jl_emit_oc_wrapper(orc::ThreadSafeModule &m, jl_codegen_params_t ¶ms, jl_method_instance_t *mi, jl_value_t *rettype) { jl_llvm_functions_t declarations; @@ -10082,67 +10070,8 @@ jl_llvm_functions_t jl_emit_codeinst( return jl_llvm_functions_t(); // failed } } - assert(jl_egal((jl_value_t*)jl_atomic_load_relaxed(&codeinst->debuginfo), (jl_value_t*)src->debuginfo) && "trying to generate code for a codeinst for an incompatible src"); + //assert(jl_egal((jl_value_t*)jl_atomic_load_relaxed(&codeinst->debuginfo), (jl_value_t*)src->debuginfo) && "trying to generate code for a codeinst for an incompatible src"); jl_llvm_functions_t decls = jl_emit_code(m, jl_get_ci_mi(codeinst), src, get_ci_abi(codeinst), params); - - const std::string &specf = decls.specFunctionObject; - const std::string &f = decls.functionObject; - if (params.cache && !f.empty()) { - // Prepare debug info to receive this function - // record that this function name came from this linfo, - // so we can build a reverse mapping for debug-info. 
- bool toplevel = !jl_is_method(jl_get_ci_mi(codeinst)->def.method); - if (!toplevel) { - //Safe b/c params holds context lock - const DataLayout &DL = m.getModuleUnlocked()->getDataLayout(); - // but don't remember toplevel thunks because - // they may not be rooted in the gc for the life of the program, - // and the runtime doesn't notify us when the code becomes unreachable :( - if (!specf.empty()) - jl_add_code_in_flight(specf, codeinst, DL); - if (!f.empty() && f != "jl_fptr_args" && f != "jl_fptr_sparam") - jl_add_code_in_flight(f, codeinst, DL); - } - - jl_value_t *inferred = jl_atomic_load_relaxed(&codeinst->inferred); - // don't change inferred state - if (inferred) { - jl_method_t *def = jl_get_ci_mi(codeinst)->def.method; - if (// keep code when keeping everything - !(JL_DELETE_NON_INLINEABLE) || - // aggressively keep code when debugging level >= 2 - // note that this uses the global jl_options.debug_level, not the local emission_ctx.debug_info_level - jl_options.debug_level > 1) { - // update the stored code - if (inferred != (jl_value_t*)src) { - // TODO: it is somewhat unclear what it means to be mutating this - if (jl_is_method(def)) { - src = (jl_code_info_t*)jl_compress_ir(def, src); - assert(jl_is_string(src)); - codeinst->relocatability = jl_string_data(src)[jl_string_len(src)-1]; - } - jl_atomic_store_release(&codeinst->inferred, (jl_value_t*)src); - jl_gc_wb(codeinst, src); - } - } - // delete non-inlineable code, since it won't be needed again - // because we already emitted LLVM code from it and the native - // Julia-level optimization will never need to see it - else if (jl_is_method(def) && // don't delete toplevel code - def->source != NULL && // don't delete code from optimized opaque closures that can't be reconstructed - inferred != jl_nothing && // and there is something to delete (test this before calling jl_ir_inlining_cost) - ((!effects_foldable(jl_atomic_load_relaxed(&codeinst->ipo_purity_bits)) && // don't delete code we may want for irinterp - (jl_ir_inlining_cost(inferred) == UINT16_MAX) && // don't delete inlineable code - !jl_generating_output()) || // don't delete code when generating a precompile file, trading memory in the short term for avoiding likely duplicating inference work for aotcompile - jl_atomic_load_relaxed(&codeinst->invoke) == jl_fptr_const_return_addr)) { // unless it is constant (although this shouldn't have had code in the first place) - // Never end up in a situation where the codeinst has no invoke, but also no source, so we never fall - // through the cracks of SOURCE_MODE_ABI. 
- jl_callptr_t expected = NULL; - jl_atomic_cmpswap_relaxed(&codeinst->invoke, &expected, jl_fptr_wait_for_compiled_addr); - jl_atomic_store_release(&codeinst->inferred, jl_nothing); - } - } - } JL_GC_POP(); return decls; } diff --git a/src/gf.c b/src/gf.c index 3a308ca6154fc..390ba72143f5d 100644 --- a/src/gf.c +++ b/src/gf.c @@ -343,8 +343,12 @@ jl_datatype_t *jl_mk_builtin_func(jl_datatype_t *dt, const char *name, jl_fptr_a // if inference doesn't occur (or can't finish), returns NULL instead jl_code_instance_t *jl_type_infer(jl_method_instance_t *mi, size_t world, uint8_t source_mode) { - if (jl_typeinf_func == NULL) - return jl_method_inferred_with_abi(mi, world); + if (jl_typeinf_func == NULL) { + if (source_mode == SOURCE_MODE_ABI) + return jl_method_inferred_with_abi(mi, world); + else + return NULL; + } jl_task_t *ct = jl_current_task; if (ct->reentrant_timing & 0b1000) { // We must avoid attempting to re-enter inference here @@ -366,6 +370,10 @@ jl_code_instance_t *jl_type_infer(jl_method_instance_t *mi, size_t world, uint8_ fargs[1] = (jl_value_t*)mi; fargs[2] = jl_box_ulong(world); fargs[3] = jl_box_uint8(source_mode); + int last_errno = errno; +#ifdef _OS_WINDOWS_ + DWORD last_error = GetLastError(); +#endif jl_timing_show_method_instance(mi, JL_TIMING_DEFAULT_BLOCK); #ifdef TRACE_INFERENCE @@ -374,10 +382,6 @@ jl_code_instance_t *jl_type_infer(jl_method_instance_t *mi, size_t world, uint8_ jl_static_show_func_sig(JL_STDERR, (jl_value_t*)mi->specTypes); jl_printf(JL_STDERR, "\n"); } -#endif - int last_errno = errno; -#ifdef _OS_WINDOWS_ - DWORD last_error = GetLastError(); #endif int last_pure = ct->ptls->in_pure_callback; ct->ptls->in_pure_callback = 0; @@ -2745,6 +2749,25 @@ void jl_read_codeinst_invoke(jl_code_instance_t *ci, uint8_t *specsigflags, jl_c jl_method_instance_t *jl_normalize_to_compilable_mi(jl_method_instance_t *mi JL_PROPAGATES_ROOT); +JL_DLLEXPORT void jl_add_codeinst_to_jit(jl_code_instance_t *codeinst, jl_code_info_t *src) +{ + assert(jl_is_code_info(src)); + jl_emit_codeinst_to_jit(codeinst, src); + jl_method_instance_t *mi = jl_get_ci_mi(codeinst); + if (jl_generating_output() && jl_is_method(mi->def.method) && jl_atomic_load_relaxed(&codeinst->inferred) == jl_nothing) { + jl_value_t *compressed = jl_compress_ir(mi->def.method, src); + // These should already be compatible (and should be an assert), but make sure of it anyways + if (jl_is_svec(src->edges)) { + jl_atomic_store_release(&codeinst->edges, (jl_svec_t*)src->edges); + jl_gc_wb(codeinst, src->edges); + } + jl_atomic_store_release(&codeinst->debuginfo, src->debuginfo); + jl_gc_wb(codeinst, src->debuginfo); + jl_atomic_store_release(&codeinst->inferred, compressed); + jl_gc_wb(codeinst, compressed); + } +} + jl_code_instance_t *jl_compile_method_internal(jl_method_instance_t *mi, size_t world) { // quick check if we already have a compiled result @@ -2815,8 +2838,7 @@ jl_code_instance_t *jl_compile_method_internal(jl_method_instance_t *mi, size_t jl_method_instance_t *unspecmi = jl_atomic_load_relaxed(&def->unspecialized); if (unspecmi) { jl_code_instance_t *unspec = jl_atomic_load_relaxed(&unspecmi->cache); - jl_callptr_t unspec_invoke = NULL; - if (unspec && (unspec_invoke = jl_atomic_load_acquire(&unspec->invoke))) { + if (unspec && jl_atomic_load_acquire(&unspec->invoke) != NULL) { uint8_t specsigflags; jl_callptr_t invoke; void *fptr; @@ -2886,7 +2908,7 @@ jl_code_instance_t *jl_compile_method_internal(jl_method_instance_t *mi, size_t } if (codeinst) { - if 
(jl_atomic_load_acquire(&codeinst->invoke) != NULL) { + if (jl_is_compiled_codeinst(codeinst)) { jl_typeinf_timing_end(start, is_recompile); // Already compiled - e.g. constabi, or compiled by a different thread while we were waiting. return codeinst; @@ -2972,12 +2994,32 @@ jl_value_t *jl_fptr_wait_for_compiled(jl_value_t *f, jl_value_t **args, uint32_t { jl_callptr_t invoke = jl_atomic_load_acquire(&m->invoke); if (invoke == &jl_fptr_wait_for_compiled) { + int64_t last_alloc = jl_options.malloc_log ? jl_gc_diff_total_bytes() : 0; + int last_errno = errno; +#ifdef _OS_WINDOWS_ + DWORD last_error = GetLastError(); +#endif jl_compile_codeinst(m); +#ifdef _OS_WINDOWS_ + SetLastError(last_error); +#endif + errno = last_errno; + if (jl_options.malloc_log) + jl_gc_sync_total_bytes(last_alloc); // discard allocation count from compilation invoke = jl_atomic_load_acquire(&m->invoke); } return invoke(f, args, nargs, m); } +// test whether codeinst->invoke is usable already without further compilation needed +JL_DLLEXPORT int jl_is_compiled_codeinst(jl_code_instance_t *codeinst) +{ + jl_callptr_t invoke = jl_atomic_load_relaxed(&codeinst->invoke); + if (invoke == NULL || invoke == &jl_fptr_wait_for_compiled) + return 0; + return 1; +} + JL_DLLEXPORT const jl_callptr_t jl_fptr_args_addr = &jl_fptr_args; JL_DLLEXPORT const jl_callptr_t jl_fptr_const_return_addr = &jl_fptr_const_return; diff --git a/src/jitlayers.cpp b/src/jitlayers.cpp index 139a5631d63c0..e2bf75d346613 100644 --- a/src/jitlayers.cpp +++ b/src/jitlayers.cpp @@ -272,7 +272,7 @@ static void finish_params(Module *M, jl_codegen_params_t ¶ms) JL_NOTSAFEPOIN // the fiction that we don't know what loads from the global will return. Thus, we // need to emit a separate module for the globals before any functions are compiled, // to ensure that the globals are defined when they are compiled. 
- if (params.imaging_mode) { + if (jl_options.image_codegen) { if (!params.global_targets.empty()) { void **globalslots = new void*[params.global_targets.size()]; void **slot = globalslots; @@ -304,7 +304,8 @@ static void finish_params(Module *M, jl_codegen_params_t ¶ms) JL_NOTSAFEPOIN // look for something with an egal ABI that is already in the JIT static jl_code_instance_t *jl_method_compiled_egal(jl_code_instance_t *ci JL_PROPAGATES_ROOT) JL_NOTSAFEPOINT { - jl_method_instance_t *mi = ci->def; + jl_value_t *def = ci->def; + jl_method_instance_t *mi = jl_get_ci_mi(ci); jl_value_t *owner = ci->owner; jl_value_t *rettype = ci->rettype; size_t min_world = jl_atomic_load_relaxed(&ci->min_world); @@ -316,6 +317,7 @@ static jl_code_instance_t *jl_method_compiled_egal(jl_code_instance_t *ci JL_PRO jl_atomic_load_relaxed(&codeinst->invoke) != NULL && jl_atomic_load_relaxed(&codeinst->min_world) <= min_world && jl_atomic_load_relaxed(&codeinst->max_world) >= max_world && + jl_egal(codeinst->def, def) && jl_egal(codeinst->owner, owner) && jl_egal(codeinst->rettype, rettype)) { return codeinst; @@ -503,15 +505,6 @@ static int jl_analyze_workqueue(jl_code_instance_t *callee, jl_codegen_params_t return params.workqueue.size(); } -// test whether codeinst->invoke is usable already without further compilation needed -static bool jl_is_compiled_codeinst(jl_code_instance_t *codeinst) JL_NOTSAFEPOINT -{ - auto invoke = jl_atomic_load_relaxed(&codeinst->invoke); - if (invoke == nullptr || invoke == jl_fptr_wait_for_compiled_addr) - return false; - return true; -} - // move codeinst (and deps) from incompletemodules to emitted modules // and populate compileready from complete_graph static void prepare_compile(jl_code_instance_t *codeinst) JL_NOTSAFEPOINT_LEAVE JL_NOTSAFEPOINT_ENTER @@ -732,10 +725,15 @@ static void jl_compile_codeinst_now(jl_code_instance_t *codeinst) } } -static void jl_emit_codeinst_to_jit( +void jl_add_code_in_flight(StringRef name, jl_code_instance_t *codeinst, const DataLayout &DL); + +extern "C" JL_DLLEXPORT_CODEGEN +void jl_emit_codeinst_to_jit_impl( jl_code_instance_t *codeinst, jl_code_info_t *src) { + if (jl_is_compiled_codeinst(codeinst)) + return; { // lock scope jl_unique_gcsafe_lock lock(engine_lock); if (invokenames.count(codeinst) || jl_is_compiled_codeinst(codeinst)) @@ -746,7 +744,7 @@ static void jl_emit_codeinst_to_jit( jl_codegen_params_t params(std::make_unique(), jl_ExecutionEngine->getDataLayout(), jl_ExecutionEngine->getTargetTriple()); // Locks the context params.getContext().setDiscardValueNames(true); params.cache = true; - params.imaging_mode = imaging_default(); + params.imaging_mode = 0; orc::ThreadSafeModule result_m = jl_create_ts_module(name_from_method_instance(jl_get_ci_mi(codeinst)), params.tsctx, params.DL, params.TargetTriple); params.temporary_roots = jl_alloc_array_1d(jl_array_any_type, 0); @@ -765,6 +763,24 @@ static void jl_emit_codeinst_to_jit( jl_unique_gcsafe_lock lock(engine_lock); if (invokenames.count(codeinst) || jl_is_compiled_codeinst(codeinst)) return; // destroy everything + const std::string &specf = decls.specFunctionObject; + const std::string &f = decls.functionObject; + assert(!f.empty()); + // Prepare debug info to receive this function + // record that this function name came from this linfo, + // so we can build a reverse mapping for debug-info. 
+ bool toplevel = !jl_is_method(jl_get_ci_mi(codeinst)->def.method); + if (!toplevel) { + // don't remember toplevel thunks because + // they may not be rooted in the gc for the life of the program, + // and the runtime doesn't notify us when the code becomes unreachable :( + if (!specf.empty()) + jl_add_code_in_flight(specf, codeinst, params.DL); + if (f != "jl_fptr_args" && f != "jl_fptr_sparam") + jl_add_code_in_flight(f, codeinst, params.DL); + } + jl_callptr_t expected = NULL; + jl_atomic_cmpswap_relaxed(&codeinst->invoke, &expected, jl_fptr_wait_for_compiled_addr); invokenames[codeinst] = std::move(decls); complete_emit(codeinst); params.tsctx_lock = params.tsctx.getLock(); // re-acquire lock @@ -853,7 +869,7 @@ int jl_compile_extern_c_impl(LLVMOrcThreadSafeModuleRef llvmmod, void *p, void * jl_codegen_params_t params(into->getContext(), DL, TargetTriple); if (pparams == NULL) { params.cache = p == NULL; - params.imaging_mode = imaging_default(); + params.imaging_mode = 0; params.tsctx.getContext()->setDiscardValueNames(true); pparams = ¶ms; } @@ -1425,7 +1441,7 @@ namespace { #endif #endif uint32_t target_flags = 0; - auto target = jl_get_llvm_target(imaging_default(), target_flags); + auto target = jl_get_llvm_target(jl_generating_output(), target_flags); auto &TheCPU = target.first; SmallVector targetFeatures(target.second.begin(), target.second.end()); std::string errorstr; diff --git a/src/jitlayers.h b/src/jitlayers.h index 342cb516debd7..b41f2e38bc470 100644 --- a/src/jitlayers.h +++ b/src/jitlayers.h @@ -68,8 +68,6 @@ using namespace llvm; -extern "C" jl_cgparams_t jl_default_cgparams; - DEFINE_SIMPLE_CONVERSION_FUNCTIONS(orc::ThreadSafeContext, LLVMOrcThreadSafeContextRef) DEFINE_SIMPLE_CONVERSION_FUNCTIONS(orc::ThreadSafeModule, LLVMOrcThreadSafeModuleRef) @@ -78,10 +76,6 @@ void jl_merge_module(orc::ThreadSafeModule &dest, orc::ThreadSafeModule src) JL_ GlobalVariable *jl_emit_RTLD_DEFAULT_var(Module *M) JL_NOTSAFEPOINT; DataLayout jl_create_datalayout(TargetMachine &TM) JL_NOTSAFEPOINT; -static inline bool imaging_default() JL_NOTSAFEPOINT { - return jl_options.image_codegen || (jl_generating_output() && (!jl_options.incremental || jl_options.use_pkgimages)); -} - struct OptimizationOptions { bool lower_intrinsics; bool dump_native; @@ -265,7 +259,7 @@ struct jl_codegen_params_t { tsctx_lock(tsctx.getLock()), DL(std::move(DL)), TargetTriple(std::move(triple)), - imaging_mode(imaging_default()) + imaging_mode(1) { // LLVM's RISC-V back-end currently does not support the Swift calling convention if (TargetTriple.isRISCV()) diff --git a/src/jl_exported_funcs.inc b/src/jl_exported_funcs.inc index 71a78b1c20fc7..cb48cf6f9962c 100644 --- a/src/jl_exported_funcs.inc +++ b/src/jl_exported_funcs.inc @@ -240,7 +240,6 @@ XX(jl_has_typevar_from_unionall) \ XX(jl_hrtime) \ XX(jl_idtable_rehash) \ - XX(jl_infer_thunk) \ XX(jl_init) \ XX(jl_init_options) \ XX(jl_init_restored_module) \ @@ -517,6 +516,7 @@ #define JL_CODEGEN_EXPORTED_FUNCS(YY) \ YY(jl_dump_function_ir) \ YY(jl_dump_method_asm) \ + YY(jl_emit_codeinst_to_jit) \ YY(jl_extern_c) \ YY(jl_get_llvmf_defn) \ YY(jl_get_llvm_function) \ @@ -542,6 +542,7 @@ YY(jl_dump_emitted_mi_name) \ YY(jl_dump_llvm_opt) \ YY(jl_dump_fptr_asm) \ + YY(jl_emit_native) \ YY(jl_get_function_id) \ YY(jl_type_to_llvm) \ YY(jl_getUnwindInfo) \ diff --git a/src/julia.h b/src/julia.h index 9b8d090fe37cb..5cafc9bfa5232 100644 --- a/src/julia.h +++ b/src/julia.h @@ -1580,6 +1580,7 @@ static inline int jl_field_isconst(jl_datatype_t *st, int i) 
JL_NOTSAFEPOINT #define jl_is_addrspacecore(v) jl_typetagis(v,jl_addrspacecore_type) #define jl_is_abioverride(v) jl_typetagis(v,jl_abioverride_type) #define jl_genericmemory_isbitsunion(a) (((jl_datatype_t*)jl_typetagof(a))->layout->flags.arrayelem_isunion) +#define jl_is_array_any(v) jl_typetagis(v,jl_array_any_type) JL_DLLEXPORT int jl_subtype(jl_value_t *a, jl_value_t *b); diff --git a/src/julia_internal.h b/src/julia_internal.h index f0debd31e2fec..7e91e23b9087d 100644 --- a/src/julia_internal.h +++ b/src/julia_internal.h @@ -400,7 +400,7 @@ extern arraylist_t eytzinger_image_tree; extern arraylist_t eytzinger_idxs; extern JL_DLLEXPORT size_t jl_page_size; -extern jl_function_t *jl_typeinf_func JL_GLOBALLY_ROOTED; +extern JL_DLLEXPORT jl_function_t *jl_typeinf_func JL_GLOBALLY_ROOTED; extern JL_DLLEXPORT size_t jl_typeinf_world; extern _Atomic(jl_typemap_entry_t*) call_cache[N_CALL_CACHE] JL_GLOBALLY_ROOTED; @@ -667,7 +667,6 @@ typedef union { // Also defined in typeinfer.jl - See documentation there. #define SOURCE_MODE_NOT_REQUIRED 0x0 #define SOURCE_MODE_ABI 0x1 -#define SOURCE_MODE_FORCE_SOURCE 0x2 JL_DLLEXPORT jl_code_instance_t *jl_engine_reserve(jl_method_instance_t *m, jl_value_t *owner); JL_DLLEXPORT void jl_engine_fulfill(jl_code_instance_t *ci, jl_code_info_t *src); @@ -683,6 +682,7 @@ JL_DLLEXPORT jl_code_instance_t *jl_get_method_inferred( JL_DLLEXPORT jl_method_instance_t *jl_get_unspecialized(jl_method_t *def JL_PROPAGATES_ROOT); JL_DLLEXPORT void jl_read_codeinst_invoke(jl_code_instance_t *ci, uint8_t *specsigflags, jl_callptr_t *invoke, void **specptr, int waitcompile); JL_DLLEXPORT jl_method_instance_t *jl_method_match_to_mi(jl_method_match_t *match, size_t world, size_t min_valid, size_t max_valid, int mt_cache); +JL_DLLEXPORT void jl_add_codeinst_to_jit(jl_code_instance_t *codeinst, jl_code_info_t *src); JL_DLLEXPORT jl_code_instance_t *jl_new_codeinst_uninit(jl_method_instance_t *mi, jl_value_t *owner); JL_DLLEXPORT jl_code_instance_t *jl_new_codeinst( @@ -695,9 +695,11 @@ JL_DLLEXPORT jl_code_instance_t *jl_new_codeinst( STATIC_INLINE jl_method_instance_t *jl_get_ci_mi(jl_code_instance_t *ci JL_PROPAGATES_ROOT) JL_NOTSAFEPOINT { - if (jl_is_abioverride(ci->def)) - return ((jl_abi_override_t*)ci->def)->def; - return (jl_method_instance_t*)ci->def; + jl_value_t *def = ci->def; + if (jl_is_abioverride(def)) + return ((jl_abi_override_t*)def)->def; + assert(jl_is_method_instance(def)); + return (jl_method_instance_t*)def; } JL_DLLEXPORT const char *jl_debuginfo_file(jl_debuginfo_t *debuginfo) JL_NOTSAFEPOINT; @@ -705,6 +707,7 @@ JL_DLLEXPORT const char *jl_debuginfo_file1(jl_debuginfo_t *debuginfo) JL_NOTSAF JL_DLLEXPORT jl_module_t *jl_debuginfo_module1(jl_value_t *debuginfo_def) JL_NOTSAFEPOINT; JL_DLLEXPORT const char *jl_debuginfo_name(jl_value_t *func) JL_NOTSAFEPOINT; +JL_DLLEXPORT int jl_is_compiled_codeinst(jl_code_instance_t *codeinst) JL_NOTSAFEPOINT; JL_DLLEXPORT void jl_compile_method_instance(jl_method_instance_t *mi, jl_tupletype_t *types, size_t world); JL_DLLEXPORT void jl_compile_method_sig(jl_method_t *m, jl_value_t *types, jl_svec_t *sparams, size_t world); JL_DLLEXPORT int jl_compile_hint(jl_tupletype_t *types); @@ -1944,6 +1947,7 @@ JL_DLLEXPORT uint32_t jl_crc32c(uint32_t crc, const char *buf, size_t len); JL_DLLIMPORT void jl_generate_fptr_for_unspecialized(jl_code_instance_t *unspec); JL_DLLIMPORT int jl_compile_codeinst(jl_code_instance_t *unspec); JL_DLLIMPORT int jl_compile_extern_c(LLVMOrcThreadSafeModuleRef llvmmod, void *params, void 
*sysimg, jl_value_t *declrt, jl_value_t *sigt); +JL_DLLIMPORT void jl_emit_codeinst_to_jit(jl_code_instance_t *codeinst, jl_code_info_t *src); typedef struct { LLVMOrcThreadSafeModuleRef TSM; @@ -1958,7 +1962,8 @@ JL_DLLIMPORT jl_value_t *jl_dump_function_ir(jl_llvmf_dump_t *dump, char strip_i JL_DLLIMPORT jl_value_t *jl_dump_function_asm(jl_llvmf_dump_t *dump, char emit_mc, const char* asm_variant, const char *debuginfo, char binary, char raw); typedef jl_value_t *(*jl_codeinstance_lookup_t)(jl_method_instance_t *mi JL_PROPAGATES_ROOT, size_t min_world, size_t max_world); -JL_DLLIMPORT void *jl_create_native(jl_array_t *methods, LLVMOrcThreadSafeModuleRef llvmmod, const jl_cgparams_t *cgparams, int policy, int imaging_mode, int cache, size_t world, jl_codeinstance_lookup_t lookup); +JL_DLLIMPORT void *jl_create_native(jl_array_t *methods, LLVMOrcThreadSafeModuleRef llvmmod, int trim, int cache, size_t world); +JL_DLLIMPORT void *jl_emit_native(jl_array_t *codeinfos, LLVMOrcThreadSafeModuleRef llvmmod, const jl_cgparams_t *cgparams, int _external_linkage); JL_DLLIMPORT void jl_dump_native(void *native_code, const char *bc_fname, const char *unopt_bc_fname, const char *obj_fname, const char *asm_fname, ios_t *z, ios_t *s, jl_emission_params_t *params); diff --git a/src/precompile_utils.c b/src/precompile_utils.c index f2777455c4ed1..8906b3eb586d3 100644 --- a/src/precompile_utils.c +++ b/src/precompile_utils.c @@ -203,8 +203,8 @@ static int precompile_enq_specialization_(jl_method_instance_t *mi, void *closur else if (jl_atomic_load_relaxed(&codeinst->invoke) != jl_fptr_const_return) { jl_value_t *inferred = jl_atomic_load_relaxed(&codeinst->inferred); if (inferred && - inferred != jl_nothing && - (jl_options.compile_enabled != JL_OPTIONS_COMPILE_ALL && jl_ir_inlining_cost(inferred) == UINT16_MAX)) { + (jl_options.compile_enabled == JL_OPTIONS_COMPILE_ALL || inferred == jl_nothing || + ((jl_is_string(inferred) || jl_is_code_info(inferred)) && jl_ir_inlining_cost(inferred) == UINT16_MAX))) { do_compile = 1; } else if (jl_atomic_load_relaxed(&codeinst->invoke) != NULL || jl_atomic_load_relaxed(&codeinst->precompile)) { @@ -275,9 +275,7 @@ static void *jl_precompile_(jl_array_t *m, int external_linkage) jl_array_ptr_1d_push(m2, item); } } - void *native_code = jl_create_native(m2, NULL, NULL, 0, 1, external_linkage, - jl_atomic_load_acquire(&jl_world_counter), - NULL); + void *native_code = jl_create_native(m2, NULL, 0, external_linkage, jl_atomic_load_acquire(&jl_world_counter)); JL_GC_POP(); return native_code; } @@ -372,8 +370,7 @@ static void *jl_precompile_trimmed(size_t world) jl_value_t *ccallable = NULL; JL_GC_PUSH2(&m, &ccallable); jl_method_instance_t *mi; - while (1) - { + while (1) { mi = (jl_method_instance_t*)arraylist_pop(jl_entrypoint_mis); if (mi == NULL) break; @@ -385,10 +382,7 @@ static void *jl_precompile_trimmed(size_t world) jl_array_ptr_1d_push(m, ccallable); } - jl_cgparams_t params = jl_default_cgparams; - params.trim = jl_options.trim; - void *native_code = jl_create_native(m, NULL, ¶ms, 0, /* imaging */ 1, 0, - world, NULL); + void *native_code = jl_create_native(m, NULL, jl_options.trim, 0, world); JL_GC_POP(); return native_code; } diff --git a/src/toplevel.c b/src/toplevel.c index 44b503d4e1463..11e1f9ad521a1 100644 --- a/src/toplevel.c +++ b/src/toplevel.c @@ -1054,7 +1054,7 @@ JL_DLLEXPORT jl_value_t *jl_toplevel_eval_flex(jl_module_t *JL_NONNULL m, jl_val size_t world = jl_atomic_load_acquire(&jl_world_counter); ct->world_age = world; if (!has_defs && 
jl_get_module_infer(m) != 0) { - (void)jl_type_infer(mfunc, world, SOURCE_MODE_NOT_REQUIRED); + (void)jl_type_infer(mfunc, world, SOURCE_MODE_ABI); } result = jl_invoke(/*func*/NULL, /*args*/NULL, /*nargs*/0, mfunc); ct->world_age = last_age; @@ -1138,20 +1138,6 @@ JL_DLLEXPORT jl_value_t *jl_toplevel_eval_in(jl_module_t *m, jl_value_t *ex) return v; } -JL_DLLEXPORT jl_value_t *jl_infer_thunk(jl_code_info_t *thk, jl_module_t *m) -{ - jl_method_instance_t *li = jl_method_instance_for_thunk(thk, m); - JL_GC_PUSH1(&li); - jl_resolve_globals_in_ir((jl_array_t*)thk->code, m, NULL, 0); - jl_task_t *ct = jl_current_task; - jl_code_instance_t *ci = jl_type_infer(li, ct->world_age, SOURCE_MODE_NOT_REQUIRED); - JL_GC_POP(); - if (ci) - return ci->rettype; - return (jl_value_t*)jl_any_type; -} - - //------------------------------------------------------------------------------ // Code loading: combined parse+eval for include() diff --git a/test/precompile.jl b/test/precompile.jl index d9bbf524609c9..2d4a4d310b4e0 100644 --- a/test/precompile.jl +++ b/test/precompile.jl @@ -750,10 +750,9 @@ precompile_test_harness("code caching") do dir struct X end struct X2 end @noinline function f(d) - @noinline - d[X()] = nothing + @noinline d[X()] = nothing end - @noinline fpush(dest) = push!(dest, X()) + @noinline fpush(dest) = @noinline push!(dest, X()) function callboth() f(Dict{X,Any}()) fpush(X[]) @@ -797,18 +796,6 @@ precompile_test_harness("code caching") do dir end end @test hasspec - # Test that compilation adds to method roots with appropriate provenance - m = which(setindex!, (Dict{M.X,Any}, Any, M.X)) - @test Memory{M.X} ∈ m.roots - # Check that roots added outside of incremental builds get attributed to a moduleid of 0 - Base.invokelatest() do - Dict{M.X2,Any}()[M.X2()] = nothing - end - @test Memory{M.X2} ∈ m.roots - groups = group_roots(m) - @test Memory{M.X} ∈ groups[Mid] # attributed to M - @test Memory{M.X2} ∈ groups[0] # activate module is not known - @test !isempty(groups[Bid]) # Check that internal methods and their roots are accounted appropriately minternal = which(M.getelsize, (Vector,)) mi = minternal.specializations::Core.MethodInstance @@ -832,12 +819,12 @@ precompile_test_harness("code caching") do dir """ module $Cache_module2 struct Y end - @noinline f(dest) = push!(dest, Y()) + @noinline f(dest) = @noinline push!(dest, Y()) callf() = f(Y[]) callf() using $(Cache_module) struct Z end - @noinline g(dest) = push!(dest, Z()) + @noinline g(dest) = @noinline push!(dest, Z()) callg() = g(Z[]) callg() end @@ -1146,22 +1133,22 @@ precompile_test_harness("invoke") do dir call_getlast(x) = getlast(x) - # force precompilation + # force precompilation, force call so that inlining heuristics don't affect the result begin Base.Experimental.@force_compile - callf(3) - callg(3) - callh(3) - callq(3) - callqi(3) - callfnc(3) - callgnc(3) - callhnc(3) - callqnc(3) - callqnci(3) - internal(3) - internalnc(3) - call_getlast([1,2,3]) + @noinline callf(3) + @noinline callg(3) + @noinline callh(3) + @noinline callq(3) + @noinline callqi(3) + @noinline callfnc(3) + @noinline callgnc(3) + @noinline callhnc(3) + @noinline callqnc(3) + @noinline callqnci(3) + @noinline internal(3) + @noinline internalnc(3) + @noinline call_getlast([1,2,3]) end # Now that we've precompiled, invalidate with a new method that overrides the `invoke` dispatch @@ -1194,9 +1181,15 @@ precompile_test_harness("invoke") do dir for func in (M.f, M.g, M.internal, M.fnc, M.gnc, M.internalnc) m = get_method_for_type(func, Real) mi = 
m.specializations::Core.MethodInstance - @test length(mi.backedges) == 2 + @test length(mi.backedges) == 2 || length(mi.backedges) == 4 # internalnc might have a constprop edge @test mi.backedges[1] === Tuple{typeof(func), Real} @test isa(mi.backedges[2], Core.CodeInstance) + if length(mi.backedges) == 4 + @test mi.backedges[3] === Tuple{typeof(func), Real} + @test isa(mi.backedges[4], Core.CodeInstance) + @test mi.backedges[2] !== mi.backedges[4] + @test mi.backedges[2].def === mi.backedges[4].def + end @test mi.cache.max_world == typemax(mi.cache.max_world) end for func in (M.q, M.qnc) @@ -1215,7 +1208,7 @@ precompile_test_harness("invoke") do dir m = only(methods(M.callq)) @test nvalid(m.specializations::Core.MethodInstance) == 0 m = only(methods(M.callqnc)) - @test nvalid(m.specializations::Core.MethodInstance) == 0 + @test nvalid(m.specializations::Core.MethodInstance) == 1 m = only(methods(M.callqi)) @test (m.specializations::Core.MethodInstance).specTypes == Tuple{typeof(M.callqi), Int} m = only(methods(M.callqnci)) From 987682318bc1bad53620904ecbc71d131fce31e5 Mon Sep 17 00:00:00 2001 From: Jameson Nash Date: Sat, 21 Dec 2024 03:09:51 +0000 Subject: [PATCH 5/5] delete unused code, so the jit no longer uses the inferred field at all --- src/codegen.cpp | 20 ++++---------- src/gf.c | 70 +++++++++++++++++++++++++++++++++----------- src/jitlayers.cpp | 68 +++++++++++-------------------------------- src/julia_internal.h | 3 +- src/opaque_closure.c | 6 ++-- 5 files changed, 80 insertions(+), 87 deletions(-) diff --git a/src/codegen.cpp b/src/codegen.cpp index 04253b67d6b6d..540ee5ec74c4c 100644 --- a/src/codegen.cpp +++ b/src/codegen.cpp @@ -10050,29 +10050,19 @@ jl_llvm_functions_t jl_emit_codeinst( { JL_TIMING(CODEGEN, CODEGEN_Codeinst); jl_timing_show_method_instance(jl_get_ci_mi(codeinst), JL_TIMING_DEFAULT_BLOCK); - JL_GC_PUSH1(&src); if (!src) { - src = (jl_code_info_t*)jl_atomic_load_relaxed(&codeinst->inferred); jl_method_instance_t *mi = jl_get_ci_mi(codeinst); - jl_method_t *def = mi->def.method; - // Check if this is the generic method for opaque closure wrappers - - // if so, this must compile specptr such that it holds the specptr -> invoke wrapper + // Assert that this is the generic method for opaque closure wrappers: + // this signals to instead compile specptr such that it holds the specptr -> invoke wrapper // to satisfy the dispatching implementation requirements of jl_f_opaque_closure_call - if (def == jl_opaque_closure_method) { - JL_GC_POP(); + if (mi->def.method == jl_opaque_closure_method) { return jl_emit_oc_wrapper(m, params, mi, codeinst->rettype); } - if (src && (jl_value_t*)src != jl_nothing && jl_is_method(def)) - src = jl_uncompress_ir(def, codeinst, (jl_value_t*)src); - if (!src || !jl_is_code_info(src)) { - JL_GC_POP(); - m = orc::ThreadSafeModule(); - return jl_llvm_functions_t(); // failed - } + m = orc::ThreadSafeModule(); + return jl_llvm_functions_t(); // user error } //assert(jl_egal((jl_value_t*)jl_atomic_load_relaxed(&codeinst->debuginfo), (jl_value_t*)src->debuginfo) && "trying to generate code for a codeinst for an incompatible src"); jl_llvm_functions_t decls = jl_emit_code(m, jl_get_ci_mi(codeinst), src, get_ci_abi(codeinst), params); - JL_GC_POP(); return decls; } diff --git a/src/gf.c b/src/gf.c index 390ba72143f5d..48cb8003072ca 100644 --- a/src/gf.c +++ b/src/gf.c @@ -337,6 +337,59 @@ jl_datatype_t *jl_mk_builtin_func(jl_datatype_t *dt, const char *name, jl_fptr_a return dt; } +// only relevant for bootstrapping.
otherwise fairly broken. +static int emit_codeinst_and_edges(jl_code_instance_t *codeinst) +{ + jl_value_t *code = jl_atomic_load_relaxed(&codeinst->inferred); + if (code) { + if (jl_atomic_load_relaxed(&codeinst->invoke) != NULL) + return 1; + if (code != jl_nothing) { + JL_GC_PUSH1(&code); + jl_method_instance_t *mi = jl_get_ci_mi(codeinst); + jl_method_t *def = mi->def.method; + if (jl_is_string(code) && jl_is_method(def)) + code = (jl_value_t*)jl_uncompress_ir(def, codeinst, (jl_value_t*)code); + if (jl_is_code_info(code)) { + jl_emit_codeinst_to_jit(codeinst, (jl_code_info_t*)code); + if (0) { + // next emit all the invoke edges too (if this seems profitable) + jl_array_t *src = ((jl_code_info_t*)code)->code; + for (size_t i = 0; i < jl_array_dim0(src); i++) { + jl_value_t *stmt = jl_array_ptr_ref(src, i); + if (jl_is_expr(stmt) && ((jl_expr_t*)stmt)->head == jl_assign_sym) + stmt = jl_exprarg(stmt, 1); + if (jl_is_expr(stmt) && ((jl_expr_t*)stmt)->head == jl_invoke_sym) { + jl_value_t *invoke = jl_exprarg(stmt, 0); + if (jl_is_code_instance(invoke)) + emit_codeinst_and_edges((jl_code_instance_t*)invoke); + } + } + } + JL_GC_POP(); + return 1; + } + JL_GC_POP(); + } + } + return 0; +} + +// Opportunistic SOURCE_MODE_ABI cache lookup, only for bootstrapping. +static jl_code_instance_t *jl_method_inferred_with_abi(jl_method_instance_t *mi JL_PROPAGATES_ROOT, size_t world) +{ + jl_code_instance_t *codeinst = jl_atomic_load_relaxed(&mi->cache); + for (; codeinst; codeinst = jl_atomic_load_relaxed(&codeinst->next)) { + if (codeinst->owner != jl_nothing) + continue; + if (jl_atomic_load_relaxed(&codeinst->min_world) <= world && world <= jl_atomic_load_relaxed(&codeinst->max_world)) { + if (emit_codeinst_and_edges(codeinst)) + return codeinst; + } + } + return NULL; +} + // run type inference on lambda "mi" for given argument types. // returns the inferred source, and may cache the result in mi // if successful, also updates the mi argument to describe the validity of this src @@ -2571,23 +2624,6 @@ jl_code_instance_t *jl_method_compiled(jl_method_instance_t *mi, size_t world) return NULL; } -// Opportunistic SOURCE_MODE_ABI cache lookup. 
-jl_code_instance_t *jl_method_inferred_with_abi(jl_method_instance_t *mi JL_PROPAGATES_ROOT, size_t world) -{ - jl_code_instance_t *codeinst = jl_atomic_load_relaxed(&mi->cache); - for (; codeinst; codeinst = jl_atomic_load_relaxed(&codeinst->next)) { - if (codeinst->owner != jl_nothing) - continue; - - if (jl_atomic_load_relaxed(&codeinst->min_world) <= world && world <= jl_atomic_load_relaxed(&codeinst->max_world)) { - jl_value_t *code = jl_atomic_load_relaxed(&codeinst->inferred); - if (code && (code != jl_nothing || (jl_atomic_load_relaxed(&codeinst->invoke) != NULL))) - return codeinst; - } - } - return NULL; -} - jl_mutex_t precomp_statement_out_lock; _Atomic(uint8_t) jl_force_trace_compile_timing_enabled = 0; diff --git a/src/jitlayers.cpp b/src/jitlayers.cpp index e2bf75d346613..21d865891e45c 100644 --- a/src/jitlayers.cpp +++ b/src/jitlayers.cpp @@ -795,47 +795,6 @@ void jl_emit_codeinst_to_jit_impl( emittedmodules[codeinst] = std::move(result_m); } -static void recursive_compile_graph( - jl_code_instance_t *codeinst, - jl_code_info_t *src) -{ - jl_emit_codeinst_to_jit(codeinst, src); - DenseSet Seen; - SmallVector workqueue; - workqueue.push_back(codeinst); - // if any edges were incomplete, try to complete them now - while (!workqueue.empty()) { - auto this_code = workqueue.pop_back_val(); - if (Seen.insert(this_code).second) { - jl_code_instance_t *compiled_ci = jl_method_compiled_egal(codeinst); - if (!compiled_ci) { - if (this_code != codeinst) { - JL_GC_PROMISE_ROOTED(this_code); // rooted transitively from following edges from original argument - jl_emit_codeinst_to_jit(this_code, nullptr); // contains safepoints - } - jl_unique_gcsafe_lock lock(engine_lock); - auto edges = complete_graph.find(this_code); - if (edges != complete_graph.end()) { - workqueue.append(edges->second); - } - } - } - } -} - -// this generates llvm code for the lambda info -// and adds the result to the jitlayers -// (and the shadow module), -// and generates code for it -static void _jl_compile_codeinst( - jl_code_instance_t *codeinst, - jl_code_info_t *src) -{ - recursive_compile_graph(codeinst, src); - jl_compile_codeinst_now(codeinst); - assert(jl_is_compiled_codeinst(codeinst)); -} - const char *jl_generate_ccallable(Module *llvmmod, void *sysimg_handle, jl_value_t *declrt, jl_value_t *sigt, jl_codegen_params_t ¶ms); @@ -859,7 +818,6 @@ int jl_compile_extern_c_impl(LLVMOrcThreadSafeModuleRef llvmmod, void *p, void * orc::ThreadSafeModule backing; bool success = true; const char *name = ""; - SmallVector dependencies; if (into == NULL) { ctx = pparams ? 
pparams->tsctx : jl_ExecutionEngine->makeContext(); backing = jl_create_ts_module("cextern", ctx, DL, TargetTriple); @@ -887,11 +845,16 @@ int jl_compile_extern_c_impl(LLVMOrcThreadSafeModuleRef llvmmod, void *p, void * } params.tsctx_lock = params.tsctx.getLock(); // re-acquire lock if (success && params.cache) { - for (auto &it : params.workqueue) { + size_t newest_world = jl_atomic_load_acquire(&jl_world_counter); + for (auto &it : params.workqueue) { // really just zero or one, and just the ABI not the rest of the metadata jl_code_instance_t *codeinst = it.first; JL_GC_PROMISE_ROOTED(codeinst); - dependencies.push_back(codeinst); - recursive_compile_graph(codeinst, nullptr); + jl_code_instance_t *newest_ci = jl_type_infer(jl_get_ci_mi(codeinst), newest_world, SOURCE_MODE_ABI); + if (newest_ci) { + if (jl_egal(codeinst->rettype, newest_ci->rettype)) + it.first = codeinst; + jl_compile_codeinst_now(newest_ci); + } } jl_analyze_workqueue(nullptr, params, true); assert(params.workqueue.empty()); @@ -904,8 +867,6 @@ int jl_compile_extern_c_impl(LLVMOrcThreadSafeModuleRef llvmmod, void *p, void * { // lock scope jl_unique_gcsafe_lock lock(extern_c_lock); if (!jl_ExecutionEngine->getGlobalValueAddress(name)) { - for (auto dep : dependencies) - jl_compile_codeinst_now(dep); { auto Lock = backing.getContext().getLock(); jl_ExecutionEngine->optimizeDLSyms(*backing.getModuleUnlocked()); // safepoint @@ -976,7 +937,7 @@ int jl_compile_codeinst_impl(jl_code_instance_t *ci) if (!jl_is_compiled_codeinst(ci)) { ++SpecFPtrCount; uint64_t start = jl_typeinf_timing_begin(); - _jl_compile_codeinst(ci, NULL); + jl_compile_codeinst_now(ci); jl_typeinf_timing_end(start, 0); newly_compiled = 1; } @@ -1007,8 +968,7 @@ void jl_generate_fptr_for_unspecialized_impl(jl_code_instance_t *unspec) } else { jl_method_instance_t *mi = jl_get_ci_mi(unspec); - jl_code_instance_t *uninferred = jl_cached_uninferred( - jl_atomic_load_relaxed(&mi->cache), 1); + jl_code_instance_t *uninferred = jl_cached_uninferred(jl_atomic_load_relaxed(&mi->cache), 1); assert(uninferred); src = (jl_code_info_t*)jl_atomic_load_relaxed(&uninferred->inferred); assert(src); @@ -1019,10 +979,16 @@ void jl_generate_fptr_for_unspecialized_impl(jl_code_instance_t *unspec) if (!jl_is_compiled_codeinst(unspec)) { assert(jl_is_code_info(src)); ++UnspecFPtrCount; + jl_svec_t *edges = (jl_svec_t*)src->edges; + if (jl_is_svec(edges)) { + jl_atomic_store_release(&unspec->edges, edges); // n.b. this assumes the field was always empty svec(), which is not entirely true + jl_gc_wb(unspec, edges); + } jl_debuginfo_t *debuginfo = src->debuginfo; jl_atomic_store_release(&unspec->debuginfo, debuginfo); // n.b. 
this assumes the field was previously NULL, which is not entirely true jl_gc_wb(unspec, debuginfo); - _jl_compile_codeinst(unspec, src); + jl_emit_codeinst_to_jit(unspec, src); + jl_compile_codeinst_now(unspec); } JL_UNLOCK(&jitlock); // Might GC } diff --git a/src/julia_internal.h b/src/julia_internal.h index 7e91e23b9087d..00103e9b00a48 100644 --- a/src/julia_internal.h +++ b/src/julia_internal.h @@ -673,7 +673,7 @@ JL_DLLEXPORT void jl_engine_fulfill(jl_code_instance_t *ci, jl_code_info_t *src) void jl_engine_sweep(jl_ptls_t *gc_all_tls_states) JL_NOTSAFEPOINT; int jl_engine_hasreserved(jl_method_instance_t *m, jl_value_t *owner) JL_NOTSAFEPOINT; -JL_DLLEXPORT jl_code_instance_t *jl_type_infer(jl_method_instance_t *li, size_t world, uint8_t source_mode); +JL_DLLEXPORT jl_code_instance_t *jl_type_infer(jl_method_instance_t *li JL_PROPAGATES_ROOT, size_t world, uint8_t source_mode); JL_DLLEXPORT jl_code_info_t *jl_gdbcodetyped1(jl_method_instance_t *mi, size_t world); JL_DLLEXPORT jl_code_instance_t *jl_compile_method_internal(jl_method_instance_t *meth JL_PROPAGATES_ROOT, size_t world); JL_DLLEXPORT jl_code_instance_t *jl_get_method_inferred( @@ -1210,7 +1210,6 @@ jl_method_instance_t *jl_get_specialized(jl_method_t *m, jl_value_t *types, jl_s JL_DLLEXPORT jl_value_t *jl_rettype_inferred(jl_value_t *owner, jl_method_instance_t *li JL_PROPAGATES_ROOT, size_t min_world, size_t max_world); JL_DLLEXPORT jl_value_t *jl_rettype_inferred_native(jl_method_instance_t *mi, size_t min_world, size_t max_world) JL_NOTSAFEPOINT; JL_DLLEXPORT jl_code_instance_t *jl_method_compiled(jl_method_instance_t *mi JL_PROPAGATES_ROOT, size_t world) JL_NOTSAFEPOINT; -JL_DLLEXPORT jl_code_instance_t *jl_method_inferred_with_abi(jl_method_instance_t *mi JL_PROPAGATES_ROOT, size_t world) JL_NOTSAFEPOINT; JL_DLLEXPORT jl_value_t *jl_methtable_lookup(jl_methtable_t *mt JL_PROPAGATES_ROOT, jl_value_t *type, size_t world); JL_DLLEXPORT jl_method_instance_t *jl_specializations_get_linfo( jl_method_t *m JL_PROPAGATES_ROOT, jl_value_t *type, jl_svec_t *sparams); diff --git a/src/opaque_closure.c b/src/opaque_closure.c index 8a9dcac30a4f8..2d11d763be662 100644 --- a/src/opaque_closure.c +++ b/src/opaque_closure.c @@ -118,8 +118,10 @@ static jl_opaque_closure_t *new_opaque_closure(jl_tupletype_t *argt, jl_value_t // OC wrapper methods are not world dependent and have no edges or other info ci = jl_get_method_inferred(mi_generic, selected_rt, 1, ~(size_t)0, NULL, NULL); - if (!jl_atomic_load_acquire(&ci->invoke)) - jl_compile_codeinst(ci); // confusing this actually calls jl_emit_oc_wrapper and never actually compiles ci (which would be impossible since it cannot have source) + if (!jl_atomic_load_acquire(&ci->invoke)) { + jl_emit_codeinst_to_jit(ci, NULL); // confusingly, this actually calls jl_emit_oc_wrapper and never actually compiles ci (which would be impossible since it cannot have source) + jl_compile_codeinst(ci); + } specptr = jl_atomic_load_relaxed(&ci->specptr.fptr); } jl_opaque_closure_t *oc = (jl_opaque_closure_t*)jl_gc_alloc(ct->ptls, sizeof(jl_opaque_closure_t), oc_type);