GC: report times spent in each step of sweeping (#56993)
We've been using this patch for a while (e.g.,
RelationalAI#176), and it has proven
valuable for identifying the most expensive part of sweeping in a
customer workload. Specifically, it highlighted that the madvise stage
was the bottleneck. These metrics let us determine whether concurrent
page sweeping would be beneficial.
d-netto authored Jan 8, 2025
1 parent 0868bec commit 11ce171
Showing 3 changed files with 60 additions and 43 deletions.
3 changes: 3 additions & 0 deletions base/timing.jl
@@ -24,6 +24,9 @@ struct GC_Num
mark_time ::Int64
stack_pool_sweep_time ::Int64
total_sweep_time ::Int64
total_sweep_page_walk_time ::Int64
total_sweep_madvise_time ::Int64
total_sweep_free_mallocd_memory_time ::Int64
total_mark_time ::Int64
total_stack_pool_sweep_time::Int64
last_full_sweep ::Int64
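As a quick check of the new fields, here is a minimal sketch of how the sweep breakdown could be inspected from the REPL, assuming the counters are surfaced through Base.gc_num() like the existing GC_Num fields (times come from jl_hrtime and are cumulative nanoseconds):

# Minimal sketch: summarize the new sweep sub-phase counters.
# Assumes the fields above are populated by the runtime; values are cumulative ns.
gcn = Base.gc_num()

phases = (
    "page walk"           => gcn.total_sweep_page_walk_time,
    "madvise"             => gcn.total_sweep_madvise_time,
    "free mallocd memory" => gcn.total_sweep_free_mallocd_memory_time,
)

total = gcn.total_sweep_time
for (name, t) in phases
    share = total == 0 ? 0.0 : 100 * t / total
    println(rpad(name, 22), round(t / 1e6; digits = 2), " ms  (",
            round(share; digits = 1), "% of total sweep)")
end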
3 changes: 3 additions & 0 deletions src/gc-interface.h
@@ -46,6 +46,9 @@ typedef struct {
uint64_t mark_time;
uint64_t stack_pool_sweep_time;
uint64_t total_sweep_time;
uint64_t total_sweep_page_walk_time;
uint64_t total_sweep_madvise_time;
uint64_t total_sweep_free_mallocd_memory_time;
uint64_t total_mark_time;
uint64_t total_stack_pool_sweep_time;
uint64_t last_full_sweep;
97 changes: 54 additions & 43 deletions src/gc-stock.c
@@ -984,9 +984,12 @@ STATIC_INLINE void gc_sweep_pool_page(gc_page_profiler_serializer_t *s, jl_gc_pa
// sweep over all memory that is being used and not in a pool
static void gc_sweep_other(jl_ptls_t ptls, int sweep_full) JL_NOTSAFEPOINT
{
uint64_t t_free_mallocd_memory_start = jl_hrtime();
gc_sweep_foreign_objs();
sweep_malloced_memory();
sweep_big(ptls);
uint64_t t_free_mallocd_memory_end = jl_hrtime();
gc_num.total_sweep_free_mallocd_memory_time += t_free_mallocd_memory_end - t_free_mallocd_memory_start;
jl_engine_sweep(gc_all_tls_states);
}

@@ -1380,66 +1383,74 @@ static void gc_sweep_pool(void)
}
}

// the actual sweeping
jl_gc_padded_page_stack_t *new_gc_allocd_scratch = (jl_gc_padded_page_stack_t *) calloc_s(n_threads * sizeof(jl_gc_padded_page_stack_t));
jl_ptls_t ptls = jl_current_task->ptls;
gc_sweep_wake_all_pages(ptls, new_gc_allocd_scratch);
gc_sweep_pool_parallel(ptls);
gc_sweep_wait_for_all_pages();

// reset half-pages pointers
for (int t_i = 0; t_i < n_threads; t_i++) {
jl_ptls_t ptls2 = gc_all_tls_states[t_i];
if (ptls2 != NULL) {
ptls2->gc_tls.page_metadata_allocd = new_gc_allocd_scratch[t_i].stack;
for (int i = 0; i < JL_GC_N_POOLS; i++) {
jl_gc_pool_t *p = &ptls2->gc_tls.heap.norm_pools[i];
p->newpages = NULL;
uint64_t t_page_walk_start = jl_hrtime();
{
// the actual sweeping
jl_gc_padded_page_stack_t *new_gc_allocd_scratch = (jl_gc_padded_page_stack_t *) calloc_s(n_threads * sizeof(jl_gc_padded_page_stack_t));
jl_ptls_t ptls = jl_current_task->ptls;
gc_sweep_wake_all_pages(ptls, new_gc_allocd_scratch);
gc_sweep_pool_parallel(ptls);
gc_sweep_wait_for_all_pages();

// reset half-pages pointers
for (int t_i = 0; t_i < n_threads; t_i++) {
jl_ptls_t ptls2 = gc_all_tls_states[t_i];
if (ptls2 != NULL) {
ptls2->gc_tls.page_metadata_allocd = new_gc_allocd_scratch[t_i].stack;
for (int i = 0; i < JL_GC_N_POOLS; i++) {
jl_gc_pool_t *p = &ptls2->gc_tls.heap.norm_pools[i];
p->newpages = NULL;
}
}
}
}

// merge free lists
for (int t_i = 0; t_i < n_threads; t_i++) {
jl_ptls_t ptls2 = gc_all_tls_states[t_i];
if (ptls2 == NULL) {
continue;
}
jl_gc_pagemeta_t *pg = jl_atomic_load_relaxed(&ptls2->gc_tls.page_metadata_allocd.bottom);
while (pg != NULL) {
jl_gc_pagemeta_t *pg2 = pg->next;
if (pg->fl_begin_offset != UINT16_MAX) {
char *cur_pg = pg->data;
jl_taggedvalue_t *fl_beg = (jl_taggedvalue_t*)(cur_pg + pg->fl_begin_offset);
jl_taggedvalue_t *fl_end = (jl_taggedvalue_t*)(cur_pg + pg->fl_end_offset);
*pfl[t_i * JL_GC_N_POOLS + pg->pool_n] = fl_beg;
pfl[t_i * JL_GC_N_POOLS + pg->pool_n] = &fl_end->next;
// merge free lists
for (int t_i = 0; t_i < n_threads; t_i++) {
jl_ptls_t ptls2 = gc_all_tls_states[t_i];
if (ptls2 == NULL) {
continue;
}
jl_gc_pagemeta_t *pg = jl_atomic_load_relaxed(&ptls2->gc_tls.page_metadata_allocd.bottom);
while (pg != NULL) {
jl_gc_pagemeta_t *pg2 = pg->next;
if (pg->fl_begin_offset != UINT16_MAX) {
char *cur_pg = pg->data;
jl_taggedvalue_t *fl_beg = (jl_taggedvalue_t*)(cur_pg + pg->fl_begin_offset);
jl_taggedvalue_t *fl_end = (jl_taggedvalue_t*)(cur_pg + pg->fl_end_offset);
*pfl[t_i * JL_GC_N_POOLS + pg->pool_n] = fl_beg;
pfl[t_i * JL_GC_N_POOLS + pg->pool_n] = &fl_end->next;
}
pg = pg2;
}
pg = pg2;
}
}

// null out terminal pointers of free lists
for (int t_i = 0; t_i < n_threads; t_i++) {
jl_ptls_t ptls2 = gc_all_tls_states[t_i];
if (ptls2 != NULL) {
for (int i = 0; i < JL_GC_N_POOLS; i++) {
*pfl[t_i * JL_GC_N_POOLS + i] = NULL;
// null out terminal pointers of free lists
for (int t_i = 0; t_i < n_threads; t_i++) {
jl_ptls_t ptls2 = gc_all_tls_states[t_i];
if (ptls2 != NULL) {
for (int i = 0; i < JL_GC_N_POOLS; i++) {
*pfl[t_i * JL_GC_N_POOLS + i] = NULL;
}
}
}
}

// cleanup
free(pfl);
free(new_gc_allocd_scratch);
// cleanup
free(pfl);
free(new_gc_allocd_scratch);
}
uint64_t t_page_walk_end = jl_hrtime();
gc_num.total_sweep_page_walk_time += t_page_walk_end - t_page_walk_start;

#ifdef _P64 // only enable concurrent sweeping on 64bit
// wake thread up to sweep concurrently
if (jl_n_sweepthreads > 0) {
uv_sem_post(&gc_sweep_assists_needed);
}
else {
uint64_t t_madvise_start = jl_hrtime();
gc_free_pages();
uint64_t t_madvise_end = jl_hrtime();
gc_num.total_sweep_madvise_time += t_madvise_end - t_madvise_start;
}
#else
gc_free_pages();
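Tying this back to the commit message: a rough way to see which sweep phase dominates in a given workload is to diff Base.gc_num() around a forced collection. This is only a sketch under the assumption that no other collection runs in between (the counters are cumulative and process-wide); note also that when concurrent page sweeping is enabled (jl_n_sweepthreads > 0), gc_free_pages() runs on the sweeper thread and, per the hunk above, is not accounted in total_sweep_madvise_time.

# Rough sketch: attribute sweep time to sub-phases for one forced collection
# by diffing the cumulative counters before and after (values in ns).
before = Base.gc_num()
GC.gc(true)                    # full collection
after = Base.gc_num()

delta(f) = getfield(after, f) - getfield(before, f)

println("page walk:    ", delta(:total_sweep_page_walk_time) / 1e6, " ms")
println("madvise:      ", delta(:total_sweep_madvise_time) / 1e6, " ms")
println("free mallocd: ", delta(:total_sweep_free_mallocd_memory_time) / 1e6, " ms")
println("total sweep:  ", delta(:total_sweep_time) / 1e6, " ms")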
