GC: report times spent in each step of sweeping (#56993)
We've been using this patch for a while (e.g.,
RelationalAI#176), and it has proven
valuable for identifying the most expensive part of sweeping in a
customer workload. Specifically, it highlighted that the madvise stage
was the bottleneck. These metrics let us determine whether concurrent
page sweeping would be beneficial.
d-netto authored Jan 8, 2025
1 parent 0868bec commit 11ce171
Showing 3 changed files with 60 additions and 43 deletions.
3 changes: 3 additions & 0 deletions base/timing.jl
@@ -24,6 +24,9 @@ struct GC_Num
mark_time ::Int64
stack_pool_sweep_time ::Int64
total_sweep_time ::Int64
total_sweep_page_walk_time ::Int64
total_sweep_madvise_time ::Int64
total_sweep_free_mallocd_memory_time ::Int64
total_mark_time ::Int64
total_stack_pool_sweep_time::Int64
last_full_sweep ::Int64
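As a quick check of the new fields, here is a minimal sketch of how the sweep breakdown could be inspected from the REPL, assuming the counters are surfaced through Base.gc_num() like the existing GC_Num fields (times come from jl_hrtime and are cumulative nanoseconds):

# Minimal sketch: summarize the new sweep sub-phase counters.
# Assumes the fields above are populated by the runtime; values are cumulative ns.
gcn = Base.gc_num()

phases = (
    "page walk"           => gcn.total_sweep_page_walk_time,
    "madvise"             => gcn.total_sweep_madvise_time,
    "free mallocd memory" => gcn.total_sweep_free_mallocd_memory_time,
)

total = gcn.total_sweep_time
for (name, t) in phases
    share = total == 0 ? 0.0 : 100 * t / total
    println(rpad(name, 22), round(t / 1e6; digits = 2), " ms  (",
            round(share; digits = 1), "% of total sweep)")
end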
3 changes: 3 additions & 0 deletions src/gc-interface.h
@@ -46,6 +46,9 @@ typedef struct {
uint64_t mark_time;
uint64_t stack_pool_sweep_time;
uint64_t total_sweep_time;
uint64_t total_sweep_page_walk_time;
uint64_t total_sweep_madvise_time;
uint64_t total_sweep_free_mallocd_memory_time;
uint64_t total_mark_time;
uint64_t total_stack_pool_sweep_time;
uint64_t last_full_sweep;
97 changes: 54 additions & 43 deletions src/gc-stock.c
@@ -984,9 +984,12 @@ STATIC_INLINE void gc_sweep_pool_page(gc_page_profiler_serializer_t *s, jl_gc_pa
// sweep over all memory that is being used and not in a pool
static void gc_sweep_other(jl_ptls_t ptls, int sweep_full) JL_NOTSAFEPOINT
{
uint64_t t_free_mallocd_memory_start = jl_hrtime();
gc_sweep_foreign_objs();
sweep_malloced_memory();
sweep_big(ptls);
uint64_t t_free_mallocd_memory_end = jl_hrtime();
gc_num.total_sweep_free_mallocd_memory_time += t_free_mallocd_memory_end - t_free_mallocd_memory_start;
jl_engine_sweep(gc_all_tls_states);
}

@@ -1380,66 +1383,74 @@ static void gc_sweep_pool(void)
}
}

// the actual sweeping
jl_gc_padded_page_stack_t *new_gc_allocd_scratch = (jl_gc_padded_page_stack_t *) calloc_s(n_threads * sizeof(jl_gc_padded_page_stack_t));
jl_ptls_t ptls = jl_current_task->ptls;
gc_sweep_wake_all_pages(ptls, new_gc_allocd_scratch);
gc_sweep_pool_parallel(ptls);
gc_sweep_wait_for_all_pages();

// reset half-pages pointers
for (int t_i = 0; t_i < n_threads; t_i++) {
jl_ptls_t ptls2 = gc_all_tls_states[t_i];
if (ptls2 != NULL) {
ptls2->gc_tls.page_metadata_allocd = new_gc_allocd_scratch[t_i].stack;
for (int i = 0; i < JL_GC_N_POOLS; i++) {
jl_gc_pool_t *p = &ptls2->gc_tls.heap.norm_pools[i];
p->newpages = NULL;
uint64_t t_page_walk_start = jl_hrtime();
{
// the actual sweeping
jl_gc_padded_page_stack_t *new_gc_allocd_scratch = (jl_gc_padded_page_stack_t *) calloc_s(n_threads * sizeof(jl_gc_padded_page_stack_t));
jl_ptls_t ptls = jl_current_task->ptls;
gc_sweep_wake_all_pages(ptls, new_gc_allocd_scratch);
gc_sweep_pool_parallel(ptls);
gc_sweep_wait_for_all_pages();

// reset half-pages pointers
for (int t_i = 0; t_i < n_threads; t_i++) {
jl_ptls_t ptls2 = gc_all_tls_states[t_i];
if (ptls2 != NULL) {
ptls2->gc_tls.page_metadata_allocd = new_gc_allocd_scratch[t_i].stack;
for (int i = 0; i < JL_GC_N_POOLS; i++) {
jl_gc_pool_t *p = &ptls2->gc_tls.heap.norm_pools[i];
p->newpages = NULL;
}
}
}
}

// merge free lists
for (int t_i = 0; t_i < n_threads; t_i++) {
jl_ptls_t ptls2 = gc_all_tls_states[t_i];
if (ptls2 == NULL) {
continue;
}
jl_gc_pagemeta_t *pg = jl_atomic_load_relaxed(&ptls2->gc_tls.page_metadata_allocd.bottom);
while (pg != NULL) {
jl_gc_pagemeta_t *pg2 = pg->next;
if (pg->fl_begin_offset != UINT16_MAX) {
char *cur_pg = pg->data;
jl_taggedvalue_t *fl_beg = (jl_taggedvalue_t*)(cur_pg + pg->fl_begin_offset);
jl_taggedvalue_t *fl_end = (jl_taggedvalue_t*)(cur_pg + pg->fl_end_offset);
*pfl[t_i * JL_GC_N_POOLS + pg->pool_n] = fl_beg;
pfl[t_i * JL_GC_N_POOLS + pg->pool_n] = &fl_end->next;
// merge free lists
for (int t_i = 0; t_i < n_threads; t_i++) {
jl_ptls_t ptls2 = gc_all_tls_states[t_i];
if (ptls2 == NULL) {
continue;
}
jl_gc_pagemeta_t *pg = jl_atomic_load_relaxed(&ptls2->gc_tls.page_metadata_allocd.bottom);
while (pg != NULL) {
jl_gc_pagemeta_t *pg2 = pg->next;
if (pg->fl_begin_offset != UINT16_MAX) {
char *cur_pg = pg->data;
jl_taggedvalue_t *fl_beg = (jl_taggedvalue_t*)(cur_pg + pg->fl_begin_offset);
jl_taggedvalue_t *fl_end = (jl_taggedvalue_t*)(cur_pg + pg->fl_end_offset);
*pfl[t_i * JL_GC_N_POOLS + pg->pool_n] = fl_beg;
pfl[t_i * JL_GC_N_POOLS + pg->pool_n] = &fl_end->next;
}
pg = pg2;
}
pg = pg2;
}
}

// null out terminal pointers of free lists
for (int t_i = 0; t_i < n_threads; t_i++) {
jl_ptls_t ptls2 = gc_all_tls_states[t_i];
if (ptls2 != NULL) {
for (int i = 0; i < JL_GC_N_POOLS; i++) {
*pfl[t_i * JL_GC_N_POOLS + i] = NULL;
// null out terminal pointers of free lists
for (int t_i = 0; t_i < n_threads; t_i++) {
jl_ptls_t ptls2 = gc_all_tls_states[t_i];
if (ptls2 != NULL) {
for (int i = 0; i < JL_GC_N_POOLS; i++) {
*pfl[t_i * JL_GC_N_POOLS + i] = NULL;
}
}
}
}

// cleanup
free(pfl);
free(new_gc_allocd_scratch);
// cleanup
free(pfl);
free(new_gc_allocd_scratch);
}
uint64_t t_page_walk_end = jl_hrtime();
gc_num.total_sweep_page_walk_time += t_page_walk_end - t_page_walk_start;

#ifdef _P64 // only enable concurrent sweeping on 64bit
// wake thread up to sweep concurrently
if (jl_n_sweepthreads > 0) {
uv_sem_post(&gc_sweep_assists_needed);
}
else {
uint64_t t_madvise_start = jl_hrtime();
gc_free_pages();
uint64_t t_madvise_end = jl_hrtime();
gc_num.total_sweep_madvise_time += t_madvise_end - t_madvise_start;
}
#else
gc_free_pages();
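Tying this back to the commit message: a rough way to see which sweep phase dominates in a given workload is to diff Base.gc_num() around a forced collection. This is only a sketch under the assumption that no other collection runs in between (the counters are cumulative and process-wide); note also that when concurrent page sweeping is enabled (jl_n_sweepthreads > 0), gc_free_pages() runs on the sweeper thread and, per the hunk above, is not accounted in total_sweep_madvise_time.

# Rough sketch: attribute sweep time to sub-phases for one forced collection
# by diffing the cumulative counters before and after (values in ns).
before = Base.gc_num()
GC.gc(true)                    # full collection
after = Base.gc_num()

delta(f) = getfield(after, f) - getfield(before, f)

println("page walk:    ", delta(:total_sweep_page_walk_time) / 1e6, " ms")
println("madvise:      ", delta(:total_sweep_madvise_time) / 1e6, " ms")
println("free mallocd: ", delta(:total_sweep_free_mallocd_memory_time) / 1e6, " ms")
println("total sweep:  ", delta(:total_sweep_time) / 1e6, " ms")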
