Skip to content

Commit

Permalink
cat: optimize data cache benchmarks
Browse files Browse the repository at this point in the history
Traverse the pointer chain fewer times for larger buffers and do a
warm-up traversal closer to PAPI_start() to prevent cache pollution and
compulsory misses.

These changes have been tested on the Intel Sapphire Rapids
architecture.
  • Loading branch information
dbarry9 committed Jan 12, 2024
1 parent 4f3cbc2 commit e235194
Show file tree
Hide file tree
Showing 3 changed files with 37 additions and 25 deletions.
14 changes: 8 additions & 6 deletions src/counter_analysis_toolkit/dcache.c
Original file line number Diff line number Diff line change
Expand Up @@ -270,7 +270,7 @@ int varyBufferSizes(long long *values, double **rslts, double **counter, hw_desc
}

// Make a cold run
out = probeBufferSize(16LL*stride, stride, pages_per_block, pattern, v, &rslt, latency_only, mode, ONT);
out = probeBufferSize(16LL*stride, stride, pages_per_block, pattern, max_size, v, &rslt, latency_only, mode, ONT);
if(out.status != 0)
goto error;

Expand All @@ -279,7 +279,7 @@ int varyBufferSizes(long long *values, double **rslts, double **counter, hw_desc
cnt = 0;
// If we don't know the cache sizes, space the measurements between two default values.
for(active_buf_len=min_size; active_buf_len<max_size; active_buf_len*=2){
out = probeBufferSize(active_buf_len, stride, pages_per_block, pattern, v, &rslt, latency_only, mode, ONT);
out = probeBufferSize(active_buf_len, stride, pages_per_block, pattern, max_size, v, &rslt, latency_only, mode, ONT);
if(out.status != 0)
goto error;
for(k = 0; k < ONT; ++k) {
Expand All @@ -288,7 +288,7 @@ int varyBufferSizes(long long *values, double **rslts, double **counter, hw_desc
}
values[cnt++] = ONT*sizeof(uintptr_t)*active_buf_len;

out = probeBufferSize((long long)((double)active_buf_len*1.25), stride, pages_per_block, pattern, v, &rslt, latency_only, mode, ONT);
out = probeBufferSize((long long)((double)active_buf_len*1.25), stride, pages_per_block, pattern, max_size, v, &rslt, latency_only, mode, ONT);
if(out.status != 0)
goto error;
for(k = 0; k < ONT; ++k) {
Expand All @@ -297,7 +297,7 @@ int varyBufferSizes(long long *values, double **rslts, double **counter, hw_desc
}
values[cnt++] = ONT*sizeof(uintptr_t)*((long long)((double)active_buf_len*1.25));

out = probeBufferSize((long long)((double)active_buf_len*1.5), stride, pages_per_block, pattern, v, &rslt, latency_only, mode, ONT);
out = probeBufferSize((long long)((double)active_buf_len*1.5), stride, pages_per_block, pattern, max_size, v, &rslt, latency_only, mode, ONT);
if(out.status != 0)
goto error;
for(k = 0; k < ONT; ++k) {
Expand All @@ -306,7 +306,7 @@ int varyBufferSizes(long long *values, double **rslts, double **counter, hw_desc
}
values[cnt++] = ONT*sizeof(uintptr_t)*((long long)((double)active_buf_len*1.5));

out = probeBufferSize((long long)((double)active_buf_len*1.75), stride, pages_per_block, pattern, v, &rslt, latency_only, mode, ONT);
out = probeBufferSize((long long)((double)active_buf_len*1.75), stride, pages_per_block, pattern, max_size, v, &rslt, latency_only, mode, ONT);
if(out.status != 0)
goto error;
for(k = 0; k < ONT; ++k) {
Expand Down Expand Up @@ -378,7 +378,9 @@ int varyBufferSizes(long long *values, double **rslts, double **counter, hw_desc
cnt=0;
for(j=0; j<len; j++){
active_buf_len = bufSizes[j]/sizeof(uintptr_t);
out = probeBufferSize(active_buf_len, stride, pages_per_block, pattern, v, &rslt, latency_only, mode, ONT);
long long llc_size = hw_desc->dcache_size[llc_idx]/hw_desc->split[llc_idx];
llc_size /= sizeof(uintptr_t);
out = probeBufferSize(active_buf_len, stride, pages_per_block, pattern, llc_size, v, &rslt, latency_only, mode, ONT);
if(out.status != 0)
goto error;
for(k = 0; k < ONT; ++k) {
Expand Down
46 changes: 28 additions & 18 deletions src/counter_analysis_toolkit/timing_kernels.c
Original file line number Diff line number Diff line change
Expand Up @@ -14,13 +14,13 @@ volatile double x,y;
extern int is_core;
char* eventname = NULL;

run_output_t probeBufferSize(long long active_buf_len, long long line_size, float pageCountPerBlock, int pattern, uintptr_t **v, uintptr_t *rslt, int latency_only, int mode, int ONT){
run_output_t probeBufferSize(long long active_buf_len, long long line_size, float pageCountPerBlock, int pattern, long long llc_size, uintptr_t **v, uintptr_t *rslt, int latency_only, int mode, int ONT){
int _papi_eventset = PAPI_NULL;
int retval, buffer = 0, status = 0;
int error_line = -1, error_type = PAPI_OK;
register uintptr_t *p = NULL;
register uintptr_t p_prime;
long long count, pageSize, blockSize;
long long pageSize, blockSize;
long long int counter[ONT];
run_output_t out;
out.status = 0;
Expand All @@ -33,13 +33,15 @@ run_output_t probeBufferSize(long long active_buf_len, long long line_size, floa
if( x > 0 || y > 0 )
printf("WARNING: x=%lf y=%lf\n",x,y);

// Make no fewer accesses than we would for a buffer of size 128KB.
long long countSingle = active_buf_len/line_size;
long long threshold = 1024LL*1024LL/64LL;
long long len = (active_buf_len > threshold) ? active_buf_len : threshold;
long long countMax;
long long unsigned threshold = 128*1024;
if( active_buf_len*sizeof(uintptr_t) > threshold )
countMax = 64LL*((long long)(active_buf_len/line_size));
else
countMax = 64LL*((long long)(threshold/line_size));
if( len > llc_size ){
countMax = 4LL*(len/line_size);
}else{
countMax = 64LL*(len/line_size);
}

// Get the size of a page of memory.
pageSize = sysconf(_SC_PAGESIZE)/sizeof(uintptr_t);
Expand All @@ -51,31 +53,27 @@ run_output_t probeBufferSize(long long active_buf_len, long long line_size, floa

// Compute the size of a block in the pointer chain and create the pointer chain.
blockSize = (long long)(pageCountPerBlock*(float)pageSize);
#pragma omp parallel reduction(+:status) default(shared)
{
int idx = omp_get_thread_num();

status += prepareArray(v[idx], active_buf_len, line_size, blockSize, pattern);
}

// Start of threaded benchmark.
#pragma omp parallel private(p,count,retval) reduction(+:buffer) reduction(+:status) firstprivate(_papi_eventset) default(shared)
#pragma omp parallel private(p,retval) reduction(+:buffer) reduction(+:status) firstprivate(_papi_eventset) default(shared)
{
int idx = omp_get_thread_num();
int thdStatus = 0;
double divisor = 1.0;
double time1=0, time2=0, dt, factor;
long long count;

// Initialize the result to a value indicating an error.
// If no error occurs, it will be overwritten.
if ( !latency_only ) {
out.counter[idx] = -1;
}

status += prepareArray(v[idx], active_buf_len, line_size, blockSize, pattern);

// We will use "p" even after the epilogue, so let's set
// it here in case an error occurs.
p = &v[idx][0];
count = countMax;

if ( !latency_only && (is_core || 0 == idx) ) {
retval = PAPI_create_eventset( &_papi_eventset );
Expand All @@ -95,8 +93,20 @@ run_output_t probeBufferSize(long long active_buf_len, long long line_size, floa
// If we can't measure events, no need to run the kernel.
goto clean_up;
}
}

// Make sure all threads start at about the same time so that we get high pressure on the memory subsystem.
#pragma omp barrier

count = countSingle;
// Make a warm-up pass to fetch the data into the cache, if it fits in any cache.
while(count > 0){
N_128;
count -= 128;
}

// Start the counters.
// Start the counters.
if ( !latency_only && (is_core || 0 == idx) ) {
retval = PAPI_start(_papi_eventset);
if ( PAPI_OK != retval ) {
error_type = retval;
Expand All @@ -108,7 +118,7 @@ run_output_t probeBufferSize(long long active_buf_len, long long line_size, floa
}

// Start the actual test.

count = countMax;
// Micro-kernel for memory reading.
if( CACHE_READ_ONLY == mode || latency_only )
{
Expand Down
2 changes: 1 addition & 1 deletion src/counter_analysis_toolkit/timing_kernels.h
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
#define CACHE_READ_ONLY 0x0
#define CACHE_READ_WRITE 0x1

run_output_t probeBufferSize(long long active_buf_len, long long line_size, float pageCountPerBlock, int pattern, uintptr_t **v, uintptr_t *rslt, int detect_size, int mode, int ONT);
run_output_t probeBufferSize(long long active_buf_len, long long line_size, float pageCountPerBlock, int pattern, long long llc_size, uintptr_t **v, uintptr_t *rslt, int detect_size, int mode, int ONT);
void error_handler(int e, int line);

#endif

0 comments on commit e235194

Please sign in to comment.