Skip to content

Commit

Permalink
Merge pull request #150 from dbarry9/2024.01.12_cat-dcache
Browse files Browse the repository at this point in the history
2024.01.12 CAT data cache benchmark improvements
  • Loading branch information
adanalis authored Jan 12, 2024
2 parents 4f3cbc2 + c51c62e commit 980322d
Show file tree
Hide file tree
Showing 5 changed files with 56 additions and 34 deletions.
34 changes: 23 additions & 11 deletions src/counter_analysis_toolkit/dcache.c
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ void d_cache_driver(char* papi_event_name, cat_params_t params, hw_desc_t *hw_de
printf("%3d%%\b\b\b\b",(100*test_cnt++)/6);
fflush(stdout);
}
status = d_cache_test(pattern, params.max_iter, hw_desc, stride, ppb, papi_event_name, latency_only, mode, ofp_papi);
status = d_cache_test(pattern, params, hw_desc, stride, ppb, papi_event_name, latency_only, mode, ofp_papi);
if( status < 0 )
goto error2;
}
Expand All @@ -90,7 +90,7 @@ void d_cache_driver(char* papi_event_name, cat_params_t params, hw_desc_t *hw_de
printf("%3d%%\b\b\b\b",(100*test_cnt++)/6);
fflush(stdout);
}
status = d_cache_test(pattern, params.max_iter, hw_desc, stride, ppb, papi_event_name, latency_only, mode, ofp_papi);
status = d_cache_test(pattern, params, hw_desc, stride, ppb, papi_event_name, latency_only, mode, ofp_papi);
if( status < 0 )
goto error2;
}
Expand All @@ -115,12 +115,13 @@ void d_cache_driver(char* papi_event_name, cat_params_t params, hw_desc_t *hw_de
return;
}

int d_cache_test(int pattern, int max_iter, hw_desc_t *hw_desc, long long stride_in_bytes, float pages_per_block, char* papi_event_name, int latency_only, int mode, FILE* ofp){
int d_cache_test(int pattern, cat_params_t params, hw_desc_t *hw_desc, long long stride_in_bytes, float pages_per_block, char* papi_event_name, int latency_only, int mode, FILE* ofp){
int i,j,k;
long long *values;
double ***rslts, *sorted_rslts;
double ***counter, *sorted_counter;
int status=0, guessCount, ONT;
int max_iter = params.max_iter;

min_size = 2*1024/sizeof(uintptr_t); // 2KB
max_size = 1024*1024*1024/sizeof(uintptr_t);// 1GB
Expand Down Expand Up @@ -175,7 +176,7 @@ int d_cache_test(int pattern, int max_iter, hw_desc_t *hw_desc, long long stride
eventname = papi_event_name;

for(i=0; i<max_iter; ++i){
status = varyBufferSizes(values, rslts[i], counter[i], hw_desc, stride_in_bytes, pages_per_block, pattern, latency_only, mode, ONT);
status = varyBufferSizes(values, rslts[i], counter[i], params, hw_desc, stride_in_bytes, pages_per_block, pattern, latency_only, mode, ONT);
if( status < 0 )
goto cleanup;
}
Expand Down Expand Up @@ -231,7 +232,7 @@ int d_cache_test(int pattern, int max_iter, hw_desc_t *hw_desc, long long stride
}


int varyBufferSizes(long long *values, double **rslts, double **counter, hw_desc_t *hw_desc, long long stride_in_bytes, float pages_per_block, int pattern, int latency_only, int mode, int ONT){
int varyBufferSizes(long long *values, double **rslts, double **counter, cat_params_t params, hw_desc_t *hw_desc, long long stride_in_bytes, float pages_per_block, int pattern, int latency_only, int mode, int ONT){
long long i;
int j, k, cnt;
long long active_buf_len;
Expand Down Expand Up @@ -270,7 +271,7 @@ int varyBufferSizes(long long *values, double **rslts, double **counter, hw_desc
}

// Make a cold run
out = probeBufferSize(16LL*stride, stride, pages_per_block, pattern, v, &rslt, latency_only, mode, ONT);
out = probeBufferSize(16LL*stride, stride, pages_per_block, pattern, max_size, v, &rslt, latency_only, mode, ONT);
if(out.status != 0)
goto error;

Expand All @@ -279,7 +280,7 @@ int varyBufferSizes(long long *values, double **rslts, double **counter, hw_desc
cnt = 0;
// If we don't know the cache sizes, space the measurements between two default values.
for(active_buf_len=min_size; active_buf_len<max_size; active_buf_len*=2){
out = probeBufferSize(active_buf_len, stride, pages_per_block, pattern, v, &rslt, latency_only, mode, ONT);
out = probeBufferSize(active_buf_len, stride, pages_per_block, pattern, max_size, v, &rslt, latency_only, mode, ONT);
if(out.status != 0)
goto error;
for(k = 0; k < ONT; ++k) {
Expand All @@ -288,7 +289,7 @@ int varyBufferSizes(long long *values, double **rslts, double **counter, hw_desc
}
values[cnt++] = ONT*sizeof(uintptr_t)*active_buf_len;

out = probeBufferSize((long long)((double)active_buf_len*1.25), stride, pages_per_block, pattern, v, &rslt, latency_only, mode, ONT);
out = probeBufferSize((long long)((double)active_buf_len*1.25), stride, pages_per_block, pattern, max_size, v, &rslt, latency_only, mode, ONT);
if(out.status != 0)
goto error;
for(k = 0; k < ONT; ++k) {
Expand All @@ -297,7 +298,7 @@ int varyBufferSizes(long long *values, double **rslts, double **counter, hw_desc
}
values[cnt++] = ONT*sizeof(uintptr_t)*((long long)((double)active_buf_len*1.25));

out = probeBufferSize((long long)((double)active_buf_len*1.5), stride, pages_per_block, pattern, v, &rslt, latency_only, mode, ONT);
out = probeBufferSize((long long)((double)active_buf_len*1.5), stride, pages_per_block, pattern, max_size, v, &rslt, latency_only, mode, ONT);
if(out.status != 0)
goto error;
for(k = 0; k < ONT; ++k) {
Expand All @@ -306,7 +307,7 @@ int varyBufferSizes(long long *values, double **rslts, double **counter, hw_desc
}
values[cnt++] = ONT*sizeof(uintptr_t)*((long long)((double)active_buf_len*1.5));

out = probeBufferSize((long long)((double)active_buf_len*1.75), stride, pages_per_block, pattern, v, &rslt, latency_only, mode, ONT);
out = probeBufferSize((long long)((double)active_buf_len*1.75), stride, pages_per_block, pattern, max_size, v, &rslt, latency_only, mode, ONT);
if(out.status != 0)
goto error;
for(k = 0; k < ONT; ++k) {
Expand Down Expand Up @@ -377,8 +378,15 @@ int varyBufferSizes(long long *values, double **rslts, double **counter, hw_desc

cnt=0;
for(j=0; j<len; j++){
char symbol[4] = "|/-\\";
active_buf_len = bufSizes[j]/sizeof(uintptr_t);
out = probeBufferSize(active_buf_len, stride, pages_per_block, pattern, v, &rslt, latency_only, mode, ONT);
if( params.show_progress ){
printf("%c\b",symbol[j%4]);
fflush(stdout);
}
long long llc_size = hw_desc->dcache_size[llc_idx]/hw_desc->split[llc_idx];
llc_size /= sizeof(uintptr_t);
out = probeBufferSize(active_buf_len, stride, pages_per_block, pattern, llc_size, v, &rslt, latency_only, mode, ONT);
if(out.status != 0)
goto error;
for(k = 0; k < ONT; ++k) {
Expand All @@ -387,6 +395,10 @@ int varyBufferSizes(long long *values, double **rslts, double **counter, hw_desc
}
values[cnt++] = bufSizes[j];
}
if( params.show_progress ){
printf(" \b");
fflush(stdout);
}

free(bufSizes);
}
Expand Down
4 changes: 2 additions & 2 deletions src/counter_analysis_toolkit/dcache.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,9 @@

#define FACTOR 12LL

int varyBufferSizes(long long *values, double **rslts, double **counter, hw_desc_t *hw_desc, long long line_size_in_bytes, float pages_per_block, int pattern, int latency_only, int mode, int ONT);
int varyBufferSizes(long long *values, double **rslts, double **counter, cat_params_t params, hw_desc_t *hw_desc, long long line_size_in_bytes, float pages_per_block, int pattern, int latency_only, int mode, int ONT);
int get_thread_count();
void d_cache_driver(char* papi_event_name, cat_params_t params, hw_desc_t *hw_desc, int latency_only, int mode);
int d_cache_test(int pattern, int max_iter, hw_desc_t *hw_desc, long long stride_in_bytes, float pages_per_block, char* papi_event_name, int latency_only, int mode, FILE* ofp);
int d_cache_test(int pattern, cat_params_t params, hw_desc_t *hw_desc, long long stride_in_bytes, float pages_per_block, char* papi_event_name, int latency_only, int mode, FILE* ofp);

#endif
4 changes: 2 additions & 2 deletions src/counter_analysis_toolkit/main.c
Original file line number Diff line number Diff line change
Expand Up @@ -880,7 +880,7 @@ void testbench(char** allevts, int cmbtotal, hw_desc_t *hw_desc, cat_params_t pa
fflush(stdout);
}
d_cache_driver("cat::latencies", params, hw_desc, 1, 0);
if(params.show_progress) printf("100%%\n");
if(params.show_progress) printf("\n");
}

if(params.show_progress) printf("D-Cache Read Benchmarks: ");
Expand All @@ -907,7 +907,7 @@ void testbench(char** allevts, int cmbtotal, hw_desc_t *hw_desc, cat_params_t pa
fflush(stdout);
}
d_cache_driver("cat::latencies", params, hw_desc, 1, 0);
if(params.show_progress) printf("100%%\n");
if(params.show_progress) printf("\n");
}

if(params.show_progress) printf("D-Cache Write Benchmarks: ");
Expand Down
46 changes: 28 additions & 18 deletions src/counter_analysis_toolkit/timing_kernels.c
Original file line number Diff line number Diff line change
Expand Up @@ -14,13 +14,13 @@ volatile double x,y;
extern int is_core;
char* eventname = NULL;

run_output_t probeBufferSize(long long active_buf_len, long long line_size, float pageCountPerBlock, int pattern, uintptr_t **v, uintptr_t *rslt, int latency_only, int mode, int ONT){
run_output_t probeBufferSize(long long active_buf_len, long long line_size, float pageCountPerBlock, int pattern, long long llc_size, uintptr_t **v, uintptr_t *rslt, int latency_only, int mode, int ONT){
int _papi_eventset = PAPI_NULL;
int retval, buffer = 0, status = 0;
int error_line = -1, error_type = PAPI_OK;
register uintptr_t *p = NULL;
register uintptr_t p_prime;
long long count, pageSize, blockSize;
long long pageSize, blockSize;
long long int counter[ONT];
run_output_t out;
out.status = 0;
Expand All @@ -33,13 +33,15 @@ run_output_t probeBufferSize(long long active_buf_len, long long line_size, floa
if( x > 0 || y > 0 )
printf("WARNING: x=%lf y=%lf\n",x,y);

// Make no fewer accesses than we would for a buffer of size 128KB.
long long countSingle = active_buf_len/line_size;
long long threshold = 1024LL*1024LL/64LL;
long long len = (active_buf_len > threshold) ? active_buf_len : threshold;
long long countMax;
long long unsigned threshold = 128*1024;
if( active_buf_len*sizeof(uintptr_t) > threshold )
countMax = 64LL*((long long)(active_buf_len/line_size));
else
countMax = 64LL*((long long)(threshold/line_size));
if( len > llc_size ){
countMax = 4LL*(len/line_size);
}else{
countMax = 64LL*(len/line_size);
}

// Get the size of a page of memory.
pageSize = sysconf(_SC_PAGESIZE)/sizeof(uintptr_t);
Expand All @@ -51,31 +53,27 @@ run_output_t probeBufferSize(long long active_buf_len, long long line_size, floa

// Compute the size of a block in the pointer chain and create the pointer chain.
blockSize = (long long)(pageCountPerBlock*(float)pageSize);
#pragma omp parallel reduction(+:status) default(shared)
{
int idx = omp_get_thread_num();

status += prepareArray(v[idx], active_buf_len, line_size, blockSize, pattern);
}

// Start of threaded benchmark.
#pragma omp parallel private(p,count,retval) reduction(+:buffer) reduction(+:status) firstprivate(_papi_eventset) default(shared)
#pragma omp parallel private(p,retval) reduction(+:buffer) reduction(+:status) firstprivate(_papi_eventset) default(shared)
{
int idx = omp_get_thread_num();
int thdStatus = 0;
double divisor = 1.0;
double time1=0, time2=0, dt, factor;
long long count;

// Initialize the result to a value indicating an error.
// If no error occurs, it will be overwritten.
if ( !latency_only ) {
out.counter[idx] = -1;
}

status += prepareArray(v[idx], active_buf_len, line_size, blockSize, pattern);

// We will use "p" even after the epilogue, so let's set
// it here in case an error occurs.
p = &v[idx][0];
count = countMax;

if ( !latency_only && (is_core || 0 == idx) ) {
retval = PAPI_create_eventset( &_papi_eventset );
Expand All @@ -95,8 +93,20 @@ run_output_t probeBufferSize(long long active_buf_len, long long line_size, floa
// If we can't measure events, no need to run the kernel.
goto clean_up;
}
}

// Make sure all threads start at about the same time so that we get high pressure on the memory subsystem.
#pragma omp barrier

count = countSingle;
// Make a warm-up pass to fetch the data into the cache, if it fits in any cache.
while(count > 0){
N_128;
count -= 128;
}

// Start the counters.
// Start the counters.
if ( !latency_only && (is_core || 0 == idx) ) {
retval = PAPI_start(_papi_eventset);
if ( PAPI_OK != retval ) {
error_type = retval;
Expand All @@ -108,7 +118,7 @@ run_output_t probeBufferSize(long long active_buf_len, long long line_size, floa
}

// Start the actual test.

count = countMax;
// Micro-kernel for memory reading.
if( CACHE_READ_ONLY == mode || latency_only )
{
Expand Down
2 changes: 1 addition & 1 deletion src/counter_analysis_toolkit/timing_kernels.h
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
#define CACHE_READ_ONLY 0x0
#define CACHE_READ_WRITE 0x1

run_output_t probeBufferSize(long long active_buf_len, long long line_size, float pageCountPerBlock, int pattern, uintptr_t **v, uintptr_t *rslt, int detect_size, int mode, int ONT);
run_output_t probeBufferSize(long long active_buf_len, long long line_size, float pageCountPerBlock, int pattern, long long llc_size, uintptr_t **v, uintptr_t *rslt, int detect_size, int mode, int ONT);
void error_handler(int e, int line);

#endif

0 comments on commit 980322d

Please sign in to comment.