From 4e2de3c27f85aebb855b4267081af0a635f554c5 Mon Sep 17 00:00:00 2001 From: Diyou Shen Date: Tue, 12 Nov 2024 13:38:53 +0100 Subject: [PATCH] [SRC][L1D Cache][VLSU] Fix a bug in VLSU caused by racing between ports when switching from store to load mode. Fix a bug in FFT kernel in final output writing and performance calculation. --- hw/ip/spatz/src/spatz_vlsu.sv | 10 ++++- sw/spatzBenchmarks/dp-fft-cache/kernel/fft.c | 5 ++- sw/spatzBenchmarks/dp-fft-cache/kernel/fft.h | 3 +- sw/spatzBenchmarks/dp-fft-cache/main.c | 42 ++++++-------------- sw/spatzBenchmarks/dp-fft/main.c | 9 ++++- 5 files changed, 33 insertions(+), 36 deletions(-) diff --git a/hw/ip/spatz/src/spatz_vlsu.sv b/hw/ip/spatz/src/spatz_vlsu.sv index 1caadfaa..837f50b4 100644 --- a/hw/ip/spatz/src/spatz_vlsu.sv +++ b/hw/ip/spatz/src/spatz_vlsu.sv @@ -888,7 +888,14 @@ module spatz_vlsu end end + // Debug flags logic [NrMemPorts-1:0] load_flag, store_flag, clear_flag; + // Indicate if each port is in loading or storing mode + logic [NrMemPorts-1:0] port_state_load; + for (genvar port = 0; port < NrMemPorts; port++) begin + assign port_state_load[port] = (port_state_q[port] == VLSU_RunningLoad); + end + // verilator lint_off LATCH always_comb begin vrf_raddr_o = {vs2_vreg_addr, vd_vreg_addr}; @@ -935,7 +942,8 @@ module spatz_vlsu if (state_q == VLSU_RunningLoad && |commit_operation_valid) begin // Enable write back to the VRF if we have a valid element in all buffers that still have to write something back. 
vrf_req_d.waddr = vd_vreg_addr; - vrf_req_valid_d = &(rob_rvalid | ~mem_pending) && |mem_pending; + // Do not enable the write-back to the VRF unless all ports are in load mode (avoids a race between ports) + vrf_req_valid_d = &(rob_rvalid | (~mem_pending && port_state_load)) && |mem_pending; for (int unsigned port = 0; port < NrMemPorts; port++) begin automatic logic [63:0] data = rob_rdata[port]; diff --git a/sw/spatzBenchmarks/dp-fft-cache/kernel/fft.c b/sw/spatzBenchmarks/dp-fft-cache/kernel/fft.c index 116b50b7..39d5daec 100644 --- a/sw/spatzBenchmarks/dp-fft-cache/kernel/fft.c +++ b/sw/spatzBenchmarks/dp-fft-cache/kernel/fft.c @@ -24,7 +24,8 @@ // todo: simplify the last iteration, which do not require twiddle factors void fft_sc(double *s, double *buf, const double *twi, const uint16_t *seq_idx, const uint16_t *rev_idx, const unsigned int nfft, - const unsigned int log2_nfft, const unsigned int cid) { + const unsigned int log2_nfft, const unsigned int cid, + const double *final_store) { // Always run in dual-core mode const unsigned int dc = 1; @@ -59,7 +60,7 @@ void fft_sc(double *s, double *buf, const double *twi, const uint16_t *seq_idx, // Last iteration if (bf == log2_nfft - 1) - o_buf = buf; + o_buf = final_store; // Update pointers const double *re_u_i = i_buf; diff --git a/sw/spatzBenchmarks/dp-fft-cache/kernel/fft.h b/sw/spatzBenchmarks/dp-fft-cache/kernel/fft.h index b381bf60..29fc773e 100644 --- a/sw/spatzBenchmarks/dp-fft-cache/kernel/fft.h +++ b/sw/spatzBenchmarks/dp-fft-cache/kernel/fft.h @@ -26,7 +26,8 @@ inline void fft_sc(double *s, double *buf, const double *twi, const uint16_t *seq_idx, const uint16_t *rev_idx, const unsigned int nfft, const unsigned int log2_nfft, - const unsigned int cid) __attribute__((always_inline)); + const unsigned int cid, const double *final_store) + __attribute__((always_inline)); // Dual-core inline void fft_2c(const double *s, double *buf, const double *twi, diff --git a/sw/spatzBenchmarks/dp-fft-cache/main.c 
b/sw/spatzBenchmarks/dp-fft-cache/main.c index 0975d9bd..d1f7ebd8 100644 --- a/sw/spatzBenchmarks/dp-fft-cache/main.c +++ b/sw/spatzBenchmarks/dp-fft-cache/main.c @@ -23,12 +23,7 @@ #include DATAHEADER #include "kernel/fft.c" -double *samples; -double *buffer; -double *twiddle; - -uint16_t *store_idx; -uint16_t *bitrev; +double *out; static inline int fp_check(const double a, const double b) { const double threshold = 0.00001; @@ -48,7 +43,6 @@ int main() { if (cid == 0) { // Init the cache l1d_init(); - l1d_wait(); } // Wait for all cores to finish snrt_cluster_hw_barrier(); @@ -59,26 +53,13 @@ int main() { // Reset timer unsigned int timer = (unsigned int)-1; - // Allocate the matrices + // Add an output buffer in SPM for the final result store (avoids a race between cores) if (cid == 0) { - samples = (double *)snrt_l1alloc(2 * NFFT * sizeof(double)); - buffer = (double *)snrt_l1alloc(2 * NFFT * sizeof(double)); - twiddle = (double *)snrt_l1alloc((2 * NTWI + NFFT) * sizeof(double)); - store_idx = - (uint16_t *)snrt_l1alloc(log2_nfft * (NFFT / 4) * sizeof(uint16_t)); - bitrev = (uint16_t *)snrt_l1alloc((NFFT / 4) * sizeof(uint16_t)); + out = (double *)snrt_l1alloc(2 * NFFT * sizeof(double)); } - // Initialize the matrices if (cid == 0) { - snrt_dma_start_1d(samples, samples_dram, 2 * NFFT * sizeof(double)); - snrt_dma_start_1d(buffer, buffer_dram, 2 * NFFT * sizeof(double)); - snrt_dma_start_1d(twiddle, twiddle_dram, - (2 * NTWI + NFFT) * sizeof(double)); - snrt_dma_start_1d(store_idx, store_idx_dram, - log2_nfft * (NFFT / 4) * sizeof(uint16_t)); - snrt_dma_start_1d(bitrev, bitrev_dram, (NFFT / 4) * sizeof(uint16_t)); - snrt_dma_wait_all(); + snrt_dma_start_1d(out, buffer_dram, 2 * NFFT * sizeof(double)); } // Wait for all cores to finish @@ -87,6 +68,7 @@ int main() { // Calculate pointers for the second butterfly onwards double *s_ = samples_dram + cid * (NFFT >> 1); double *buf_ = buffer_dram + cid * (NFFT >> 1); + double *out_ = out + cid * (NFFT >> 1); // double 
*twi_ = twiddle + NFFT; double *twi_ = twiddle_dram + NFFT; @@ -107,7 +89,7 @@ int main() { snrt_cluster_hw_barrier(); // Fall back into the single-core case - fft_sc(s_, buf_, twi_, store_idx_dram, bitrev_dram, NFFT >> 1, log2_nfft, cid); + fft_sc(s_, buf_, twi_, store_idx_dram, bitrev_dram, NFFT >> 1, log2_nfft, cid, out_); // Wait for all cores to finish fft snrt_cluster_hw_barrier(); @@ -131,8 +113,8 @@ int main() { // Display runtime if (cid == 0) { long unsigned int performance = - 1000 * 10 * NFFT * log2_nfft * 6 / 5 / timer; - long unsigned int utilization = performance / (2 * num_cores * 4); + 1000 * 5 * NFFT * (log2_nfft+1) / timer; + long unsigned int utilization = (1000 * performance) / (1250 * num_cores * 4); printf("\n----- fft on %d samples -----\n", NFFT); printf("The execution took %u cycles.\n", timer); @@ -141,17 +123,17 @@ int main() { // Verify the real part for (unsigned int i = 0; i < NFFT; i++) { - if (fp_check(buffer_dram[i], gold_out_dram[2 * i])) { + if (fp_check(out[i], gold_out_dram[2 * i])) { printf("Error: Index %d -> Result = %f, Expected = %f\n", i, - (float)buffer_dram[i], (float)gold_out_dram[2 * i]); + (float)out[i], (float)gold_out_dram[2 * i]); } } // Verify the imac part for (unsigned int i = 0; i < NFFT; i++) { - if (fp_check(buffer_dram[i + NFFT], gold_out_dram[2 * i + 1])) { + if (fp_check(out[i + NFFT], gold_out_dram[2 * i + 1])) { printf("Error: Index %d -> Result = %f, Expected = %f\n", i + NFFT, - (float)buffer_dram[i + NFFT], (float)gold_out_dram[2 * i + 1]); + (float)out[i + NFFT], (float)gold_out_dram[2 * i + 1]); } } } diff --git a/sw/spatzBenchmarks/dp-fft/main.c b/sw/spatzBenchmarks/dp-fft/main.c index 6bf64908..8de968de 100644 --- a/sw/spatzBenchmarks/dp-fft/main.c +++ b/sw/spatzBenchmarks/dp-fft/main.c @@ -45,6 +45,11 @@ int main() { const unsigned int num_cores = snrt_cluster_core_num(); const unsigned int cid = snrt_cluster_core_idx(); + if (cid == 0) { + // Init the cache + l1d_init(); + } + // log2(nfft). 
const unsigned int log2_nfft = 31 - __builtin_clz(NFFT >> 1); @@ -114,8 +119,8 @@ int main() { // Display runtime if (cid == 0) { long unsigned int performance = - 1000 * 10 * NFFT * log2_nfft * 6 / 5 / timer; - long unsigned int utilization = performance / (2 * num_cores * 4); + 1000 * 5 * NFFT * (log2_nfft+1) / timer; + long unsigned int utilization = (1000 * performance) / (1250 * num_cores * 4); printf("\n----- fft on %d samples -----\n", NFFT); printf("The execution took %u cycles.\n", timer);