diff --git a/src/counter_analysis_toolkit/dcache.c b/src/counter_analysis_toolkit/dcache.c index 5b70ae9f8..c609a6d65 100644 --- a/src/counter_analysis_toolkit/dcache.c +++ b/src/counter_analysis_toolkit/dcache.c @@ -78,7 +78,7 @@ void d_cache_driver(char* papi_event_name, cat_params_t params, hw_desc_t *hw_de printf("%3d%%\b\b\b\b",(100*test_cnt++)/6); fflush(stdout); } - status = d_cache_test(pattern, params.max_iter, hw_desc, stride, ppb, papi_event_name, latency_only, mode, ofp_papi); + status = d_cache_test(pattern, params, hw_desc, stride, ppb, papi_event_name, latency_only, mode, ofp_papi); if( status < 0 ) goto error2; } @@ -90,7 +90,7 @@ void d_cache_driver(char* papi_event_name, cat_params_t params, hw_desc_t *hw_de printf("%3d%%\b\b\b\b",(100*test_cnt++)/6); fflush(stdout); } - status = d_cache_test(pattern, params.max_iter, hw_desc, stride, ppb, papi_event_name, latency_only, mode, ofp_papi); + status = d_cache_test(pattern, params, hw_desc, stride, ppb, papi_event_name, latency_only, mode, ofp_papi); if( status < 0 ) goto error2; } @@ -115,12 +115,13 @@ void d_cache_driver(char* papi_event_name, cat_params_t params, hw_desc_t *hw_de return; } -int d_cache_test(int pattern, int max_iter, hw_desc_t *hw_desc, long long stride_in_bytes, float pages_per_block, char* papi_event_name, int latency_only, int mode, FILE* ofp){ +int d_cache_test(int pattern, cat_params_t params, hw_desc_t *hw_desc, long long stride_in_bytes, float pages_per_block, char* papi_event_name, int latency_only, int mode, FILE* ofp){ int i,j,k; long long *values; double ***rslts, *sorted_rslts; double ***counter, *sorted_counter; int status=0, guessCount, ONT; + int max_iter = params.max_iter; min_size = 2*1024/sizeof(uintptr_t); // 2KB max_size = 1024*1024*1024/sizeof(uintptr_t);// 1GB @@ -175,7 +176,7 @@ int d_cache_test(int pattern, int max_iter, hw_desc_t *hw_desc, long long stride eventname = papi_event_name; for(i=0; idcache_size[llc_idx]/hw_desc->split[llc_idx]; + llc_size /= sizeof(uintptr_t); + out = probeBufferSize(active_buf_len, stride, pages_per_block, pattern, llc_size, v, &rslt, latency_only, mode, ONT); if(out.status != 0) goto error; for(k = 0; k < ONT; ++k) { @@ -387,6 +395,10 @@ int varyBufferSizes(long long *values, double **rslts, double **counter, hw_desc } values[cnt++] = bufSizes[j]; } + if( params.show_progress ){ + printf(" \b"); + fflush(stdout); + } free(bufSizes); } diff --git a/src/counter_analysis_toolkit/dcache.h b/src/counter_analysis_toolkit/dcache.h index e4ea08929..efb6e915a 100644 --- a/src/counter_analysis_toolkit/dcache.h +++ b/src/counter_analysis_toolkit/dcache.h @@ -8,9 +8,9 @@ #define FACTOR 12LL -int varyBufferSizes(long long *values, double **rslts, double **counter, hw_desc_t *hw_desc, long long line_size_in_bytes, float pages_per_block, int pattern, int latency_only, int mode, int ONT); +int varyBufferSizes(long long *values, double **rslts, double **counter, cat_params_t params, hw_desc_t *hw_desc, long long line_size_in_bytes, float pages_per_block, int pattern, int latency_only, int mode, int ONT); int get_thread_count(); void d_cache_driver(char* papi_event_name, cat_params_t params, hw_desc_t *hw_desc, int latency_only, int mode); -int d_cache_test(int pattern, int max_iter, hw_desc_t *hw_desc, long long stride_in_bytes, float pages_per_block, char* papi_event_name, int latency_only, int mode, FILE* ofp); +int d_cache_test(int pattern, cat_params_t params, hw_desc_t *hw_desc, long long stride_in_bytes, float pages_per_block, char* papi_event_name, int latency_only, int mode, FILE* ofp); #endif diff --git a/src/counter_analysis_toolkit/main.c b/src/counter_analysis_toolkit/main.c index 71b055cf4..85b4f68b2 100644 --- a/src/counter_analysis_toolkit/main.c +++ b/src/counter_analysis_toolkit/main.c @@ -880,7 +880,7 @@ void testbench(char** allevts, int cmbtotal, hw_desc_t *hw_desc, cat_params_t pa fflush(stdout); } d_cache_driver("cat::latencies", params, hw_desc, 1, 0); - if(params.show_progress) printf("100%%\n"); + if(params.show_progress) printf("\n"); } if(params.show_progress) printf("D-Cache Read Benchmarks: "); @@ -907,7 +907,7 @@ void testbench(char** allevts, int cmbtotal, hw_desc_t *hw_desc, cat_params_t pa fflush(stdout); } d_cache_driver("cat::latencies", params, hw_desc, 1, 0); - if(params.show_progress) printf("100%%\n"); + if(params.show_progress) printf("\n"); } if(params.show_progress) printf("D-Cache Write Benchmarks: "); diff --git a/src/counter_analysis_toolkit/timing_kernels.c b/src/counter_analysis_toolkit/timing_kernels.c index 8919ab002..7cef57c07 100644 --- a/src/counter_analysis_toolkit/timing_kernels.c +++ b/src/counter_analysis_toolkit/timing_kernels.c @@ -14,13 +14,13 @@ volatile double x,y; extern int is_core; char* eventname = NULL; -run_output_t probeBufferSize(long long active_buf_len, long long line_size, float pageCountPerBlock, int pattern, uintptr_t **v, uintptr_t *rslt, int latency_only, int mode, int ONT){ +run_output_t probeBufferSize(long long active_buf_len, long long line_size, float pageCountPerBlock, int pattern, long long llc_size, uintptr_t **v, uintptr_t *rslt, int latency_only, int mode, int ONT){ int _papi_eventset = PAPI_NULL; int retval, buffer = 0, status = 0; int error_line = -1, error_type = PAPI_OK; register uintptr_t *p = NULL; register uintptr_t p_prime; - long long count, pageSize, blockSize; + long long pageSize, blockSize; long long int counter[ONT]; run_output_t out; out.status = 0; @@ -33,13 +33,15 @@ run_output_t probeBufferSize(long long active_buf_len, long long line_size, floa if( x > 0 || y > 0 ) printf("WARNING: x=%lf y=%lf\n",x,y); - // Make no fewer accesses than we would for a buffer of size 128KB. + long long countSingle = active_buf_len/line_size; + long long threshold = 1024LL*1024LL/64LL; + long long len = (active_buf_len > threshold) ? active_buf_len : threshold; long long countMax; - long long unsigned threshold = 128*1024; - if( active_buf_len*sizeof(uintptr_t) > threshold ) - countMax = 64LL*((long long)(active_buf_len/line_size)); - else - countMax = 64LL*((long long)(threshold/line_size)); + if( len > llc_size ){ + countMax = 4LL*(len/line_size); + }else{ + countMax = 64LL*(len/line_size); + } // Get the size of a page of memory. pageSize = sysconf(_SC_PAGESIZE)/sizeof(uintptr_t); @@ -51,20 +53,15 @@ run_output_t probeBufferSize(long long active_buf_len, long long line_size, floa // Compute the size of a block in the pointer chain and create the pointer chain. blockSize = (long long)(pageCountPerBlock*(float)pageSize); - #pragma omp parallel reduction(+:status) default(shared) - { - int idx = omp_get_thread_num(); - - status += prepareArray(v[idx], active_buf_len, line_size, blockSize, pattern); - } // Start of threaded benchmark. - #pragma omp parallel private(p,count,retval) reduction(+:buffer) reduction(+:status) firstprivate(_papi_eventset) default(shared) + #pragma omp parallel private(p,retval) reduction(+:buffer) reduction(+:status) firstprivate(_papi_eventset) default(shared) { int idx = omp_get_thread_num(); int thdStatus = 0; double divisor = 1.0; double time1=0, time2=0, dt, factor; + long long count; // Initialize the result to a value indicating an error. // If no error occurs, it will be overwritten. @@ -72,10 +69,11 @@ run_output_t probeBufferSize(long long active_buf_len, long long line_size, floa out.counter[idx] = -1; } + status += prepareArray(v[idx], active_buf_len, line_size, blockSize, pattern); + // We will use "p" even after the epilogue, so let's set // it here in case an error occurs. p = &v[idx][0]; - count = countMax; if ( !latency_only && (is_core || 0 == idx) ) { retval = PAPI_create_eventset( &_papi_eventset ); @@ -95,8 +93,20 @@ run_output_t probeBufferSize(long long active_buf_len, long long line_size, floa // If we can't measure events, no need to run the kernel. goto clean_up; } + } + + // Make sure all threads start at about the same time so that we get high pressure on the memory subsystem. + #pragma omp barrier + + count = countSingle; + // Make a warm-up pass to fetch the data into the cache, if it fits in any cache. + while(count > 0){ + N_128; + count -= 128; + } - // Start the counters. + // Start the counters. + if ( !latency_only && (is_core || 0 == idx) ) { retval = PAPI_start(_papi_eventset); if ( PAPI_OK != retval ) { error_type = retval; @@ -108,7 +118,7 @@ run_output_t probeBufferSize(long long active_buf_len, long long line_size, floa } // Start the actual test. - + count = countMax; // Micro-kernel for memory reading. if( CACHE_READ_ONLY == mode || latency_only ) { diff --git a/src/counter_analysis_toolkit/timing_kernels.h b/src/counter_analysis_toolkit/timing_kernels.h index e2d5f3174..c0c52640a 100644 --- a/src/counter_analysis_toolkit/timing_kernels.h +++ b/src/counter_analysis_toolkit/timing_kernels.h @@ -27,7 +27,7 @@ #define CACHE_READ_ONLY 0x0 #define CACHE_READ_WRITE 0x1 -run_output_t probeBufferSize(long long active_buf_len, long long line_size, float pageCountPerBlock, int pattern, uintptr_t **v, uintptr_t *rslt, int detect_size, int mode, int ONT); +run_output_t probeBufferSize(long long active_buf_len, long long line_size, float pageCountPerBlock, int pattern, long long llc_size, uintptr_t **v, uintptr_t *rslt, int detect_size, int mode, int ONT); void error_handler(int e, int line); #endif