From 4e2de3c27f85aebb855b4267081af0a635f554c5 Mon Sep 17 00:00:00 2001
From: Diyou Shen <dishen@iis.ee.ethz.ch>
Date: Tue, 12 Nov 2024 13:38:53 +0100
Subject: [PATCH] [SRC][L1D Cache][VLSU] Fix a VLSU bug caused by a race
 between ports when switching from store to load mode. Fix bugs in the FFT
 kernel's final output writing and performance calculation.

---
 hw/ip/spatz/src/spatz_vlsu.sv                | 10 ++++-
 sw/spatzBenchmarks/dp-fft-cache/kernel/fft.c |  5 ++-
 sw/spatzBenchmarks/dp-fft-cache/kernel/fft.h |  3 +-
 sw/spatzBenchmarks/dp-fft-cache/main.c       | 42 ++++++--------------
 sw/spatzBenchmarks/dp-fft/main.c             |  9 ++++-
 5 files changed, 33 insertions(+), 36 deletions(-)

diff --git a/hw/ip/spatz/src/spatz_vlsu.sv b/hw/ip/spatz/src/spatz_vlsu.sv
index 1caadfaa..837f50b4 100644
--- a/hw/ip/spatz/src/spatz_vlsu.sv
+++ b/hw/ip/spatz/src/spatz_vlsu.sv
@@ -888,7 +888,14 @@ module spatz_vlsu
     end
   end
 
+  // Debug flags
   logic [NrMemPorts-1:0] load_flag, store_flag, clear_flag;
+  // Per-port flag: high when that port's state is VLSU_RunningLoad
+  logic [NrMemPorts-1:0] port_state_load;
+  for (genvar port = 0; port < NrMemPorts; port++) begin
+    assign port_state_load[port] = (port_state_q[port] == VLSU_RunningLoad);
+  end
+
   // verilator lint_off LATCH
   always_comb begin
     vrf_raddr_o     = {vs2_vreg_addr, vd_vreg_addr};
@@ -935,7 +942,8 @@ module spatz_vlsu
       if (state_q == VLSU_RunningLoad && |commit_operation_valid) begin
         // Enable write back to the VRF if we have a valid element in all buffers that still have to write something back.
         vrf_req_d.waddr = vd_vreg_addr;
-        vrf_req_valid_d = &(rob_rvalid | ~mem_pending) && |mem_pending;
+        // Do not enable the VRF write-back unless all ports are in load mode (avoids a race between ports)
+        vrf_req_valid_d = &(rob_rvalid | (~mem_pending && port_state_load)) && |mem_pending;
 
         for (int unsigned port = 0; port < NrMemPorts; port++) begin
           automatic logic [63:0] data = rob_rdata[port];
diff --git a/sw/spatzBenchmarks/dp-fft-cache/kernel/fft.c b/sw/spatzBenchmarks/dp-fft-cache/kernel/fft.c
index 116b50b7..39d5daec 100644
--- a/sw/spatzBenchmarks/dp-fft-cache/kernel/fft.c
+++ b/sw/spatzBenchmarks/dp-fft-cache/kernel/fft.c
@@ -24,7 +24,8 @@
 // todo: simplify the last iteration, which do not require twiddle factors
 void fft_sc(double *s, double *buf, const double *twi, const uint16_t *seq_idx,
             const uint16_t *rev_idx, const unsigned int nfft,
-            const unsigned int log2_nfft, const unsigned int cid) {
+            const unsigned int log2_nfft, const unsigned int cid,
+            const double *final_store) {
 
   // Always run in dual-core mode
   const unsigned int dc = 1;
@@ -59,7 +60,7 @@ void fft_sc(double *s, double *buf, const double *twi, const uint16_t *seq_idx,
 
     // Last iteration
     if (bf == log2_nfft - 1)
-      o_buf = buf;
+      o_buf = final_store;
 
     // Update pointers
     const double *re_u_i = i_buf;
diff --git a/sw/spatzBenchmarks/dp-fft-cache/kernel/fft.h b/sw/spatzBenchmarks/dp-fft-cache/kernel/fft.h
index b381bf60..29fc773e 100644
--- a/sw/spatzBenchmarks/dp-fft-cache/kernel/fft.h
+++ b/sw/spatzBenchmarks/dp-fft-cache/kernel/fft.h
@@ -26,7 +26,8 @@
 inline void fft_sc(double *s, double *buf, const double *twi,
                    const uint16_t *seq_idx, const uint16_t *rev_idx,
                    const unsigned int nfft, const unsigned int log2_nfft,
-                   const unsigned int cid) __attribute__((always_inline));
+                   const unsigned int cid, const double *final_store)
+    __attribute__((always_inline));
 
 // Dual-core
 inline void fft_2c(const double *s, double *buf, const double *twi,
diff --git a/sw/spatzBenchmarks/dp-fft-cache/main.c b/sw/spatzBenchmarks/dp-fft-cache/main.c
index 0975d9bd..d1f7ebd8 100644
--- a/sw/spatzBenchmarks/dp-fft-cache/main.c
+++ b/sw/spatzBenchmarks/dp-fft-cache/main.c
@@ -23,12 +23,7 @@
 #include DATAHEADER
 #include "kernel/fft.c"
 
-double *samples;
-double *buffer;
-double *twiddle;
-
-uint16_t *store_idx;
-uint16_t *bitrev;
+double *out;
 
 static inline int fp_check(const double a, const double b) {
   const double threshold = 0.00001;
@@ -48,7 +43,6 @@ int main() {
   if (cid == 0) {
     // Init the cache
     l1d_init();
-    l1d_wait();
   }
   // Wait for all cores to finish
   snrt_cluster_hw_barrier();
@@ -59,26 +53,13 @@ int main() {
   // Reset timer
   unsigned int timer = (unsigned int)-1;
 
-  // Allocate the matrices
+  // Add an output buffer in SPM for the final result store (avoids racing between cores)
   if (cid == 0) {
-    samples = (double *)snrt_l1alloc(2 * NFFT * sizeof(double));
-    buffer = (double *)snrt_l1alloc(2 * NFFT * sizeof(double));
-    twiddle = (double *)snrt_l1alloc((2 * NTWI + NFFT) * sizeof(double));
-    store_idx =
-        (uint16_t *)snrt_l1alloc(log2_nfft * (NFFT / 4) * sizeof(uint16_t));
-    bitrev = (uint16_t *)snrt_l1alloc((NFFT / 4) * sizeof(uint16_t));
+    out = (double *)snrt_l1alloc(2 * NFFT * sizeof(double));
   }
 
-  // Initialize the matrices
   if (cid == 0) {
-    snrt_dma_start_1d(samples, samples_dram, 2 * NFFT * sizeof(double));
-    snrt_dma_start_1d(buffer, buffer_dram, 2 * NFFT * sizeof(double));
-    snrt_dma_start_1d(twiddle, twiddle_dram,
-                      (2 * NTWI + NFFT) * sizeof(double));
-    snrt_dma_start_1d(store_idx, store_idx_dram,
-                      log2_nfft * (NFFT / 4) * sizeof(uint16_t));
-    snrt_dma_start_1d(bitrev, bitrev_dram, (NFFT / 4) * sizeof(uint16_t));
-    snrt_dma_wait_all();
+    snrt_dma_start_1d(out, buffer_dram, 2 * NFFT * sizeof(double));
   }
 
   // Wait for all cores to finish
@@ -87,6 +68,7 @@ int main() {
   // Calculate pointers for the second butterfly onwards
   double *s_ = samples_dram + cid * (NFFT >> 1);
   double *buf_ = buffer_dram + cid * (NFFT >> 1);
+  double *out_ = out + cid * (NFFT >> 1);
   // double *twi_ = twiddle + NFFT;
   double *twi_ = twiddle_dram + NFFT;
 
@@ -107,7 +89,7 @@ int main() {
   snrt_cluster_hw_barrier();
 
   // Fall back into the single-core case
-  fft_sc(s_, buf_, twi_, store_idx_dram, bitrev_dram, NFFT >> 1, log2_nfft, cid);
+  fft_sc(s_, buf_, twi_, store_idx_dram, bitrev_dram, NFFT >> 1, log2_nfft, cid, out_);
 
   // Wait for all cores to finish fft
   snrt_cluster_hw_barrier();
@@ -131,8 +113,8 @@ int main() {
   // Display runtime
   if (cid == 0) {
     long unsigned int performance =
-        1000 * 10 * NFFT * log2_nfft * 6 / 5 / timer;
-    long unsigned int utilization = performance / (2 * num_cores * 4);
+        1000 * 5 * NFFT * (log2_nfft+1) / timer;
+    long unsigned int utilization = (1000 * performance) / (1250 * num_cores * 4);
 
     printf("\n----- fft on %d samples -----\n", NFFT);
     printf("The execution took %u cycles.\n", timer);
@@ -141,17 +123,17 @@ int main() {
 
     // Verify the real part
     for (unsigned int i = 0; i < NFFT; i++) {
-      if (fp_check(buffer_dram[i], gold_out_dram[2 * i])) {
+      if (fp_check(out[i], gold_out_dram[2 * i])) {
         printf("Error: Index %d -> Result = %f, Expected = %f\n", i,
-               (float)buffer_dram[i], (float)gold_out_dram[2 * i]);
+               (float)out[i], (float)gold_out_dram[2 * i]);
       }
     }
 
     // Verify the imac part
     for (unsigned int i = 0; i < NFFT; i++) {
-      if (fp_check(buffer_dram[i + NFFT], gold_out_dram[2 * i + 1])) {
+      if (fp_check(out[i + NFFT], gold_out_dram[2 * i + 1])) {
         printf("Error: Index %d -> Result = %f, Expected = %f\n", i + NFFT,
-               (float)buffer_dram[i + NFFT], (float)gold_out_dram[2 * i + 1]);
+               (float)out[i + NFFT], (float)gold_out_dram[2 * i + 1]);
       }
     }
   }
diff --git a/sw/spatzBenchmarks/dp-fft/main.c b/sw/spatzBenchmarks/dp-fft/main.c
index 6bf64908..8de968de 100644
--- a/sw/spatzBenchmarks/dp-fft/main.c
+++ b/sw/spatzBenchmarks/dp-fft/main.c
@@ -45,6 +45,11 @@ int main() {
   const unsigned int num_cores = snrt_cluster_core_num();
   const unsigned int cid = snrt_cluster_core_idx();
 
+  if (cid == 0) {
+    // Init the cache
+    l1d_init();
+  }
+
   // log2(nfft).
   const unsigned int log2_nfft = 31 - __builtin_clz(NFFT >> 1);
 
@@ -114,8 +119,8 @@ int main() {
   // Display runtime
   if (cid == 0) {
     long unsigned int performance =
-        1000 * 10 * NFFT * log2_nfft * 6 / 5 / timer;
-    long unsigned int utilization = performance / (2 * num_cores * 4);
+        1000 * 5 * NFFT * (log2_nfft+1) / timer;
+    long unsigned int utilization = (1000 * performance) / (1250 * num_cores * 4);
 
     printf("\n----- fft on %d samples -----\n", NFFT);
     printf("The execution took %u cycles.\n", timer);