Skip to content

Commit

Permalink
[SRC][L1D Cache][VLSU] Fix a bug in VLSU caused by racing between por…
Browse files Browse the repository at this point in the history
…ts when switching from store to load mode. Fix a bug in FFT kernel in final output writing and performance calculation.
  • Loading branch information
DiyouS committed Nov 12, 2024
1 parent 64ce4b4 commit 4e2de3c
Show file tree
Hide file tree
Showing 5 changed files with 33 additions and 36 deletions.
10 changes: 9 additions & 1 deletion hw/ip/spatz/src/spatz_vlsu.sv
Original file line number Diff line number Diff line change
Expand Up @@ -888,7 +888,14 @@ module spatz_vlsu
end
end

// Debug flags
logic [NrMemPorts-1:0] load_flag, store_flag, clear_flag;
// Indicate if each port is in loading or storing mode
logic [NrMemPorts-1:0] port_state_load;
for (genvar port = 0; port < NrMemPorts; port++) begin
assign port_state_load[port] = (port_state_q[port] == VLSU_RunningLoad);
end

// verilator lint_off LATCH
always_comb begin
vrf_raddr_o = {vs2_vreg_addr, vd_vreg_addr};
Expand Down Expand Up @@ -935,7 +942,8 @@ module spatz_vlsu
if (state_q == VLSU_RunningLoad && |commit_operation_valid) begin
// Enable write back to the VRF if we have a valid element in all buffers that still have to write something back.
vrf_req_d.waddr = vd_vreg_addr;
vrf_req_valid_d = &(rob_rvalid | ~mem_pending) && |mem_pending;
// Here, we do not enable the writing to ROB unless all ports are in loading mode (avoid racing in ports)
vrf_req_valid_d = &(rob_rvalid | (~mem_pending && port_state_load)) && |mem_pending;

for (int unsigned port = 0; port < NrMemPorts; port++) begin
automatic logic [63:0] data = rob_rdata[port];
Expand Down
5 changes: 3 additions & 2 deletions sw/spatzBenchmarks/dp-fft-cache/kernel/fft.c
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,8 @@
// TODO: simplify the last iteration, which does not require twiddle factors
void fft_sc(double *s, double *buf, const double *twi, const uint16_t *seq_idx,
const uint16_t *rev_idx, const unsigned int nfft,
const unsigned int log2_nfft, const unsigned int cid) {
const unsigned int log2_nfft, const unsigned int cid,
const double *final_store) {

// Always run in dual-core mode
const unsigned int dc = 1;
Expand Down Expand Up @@ -59,7 +60,7 @@ void fft_sc(double *s, double *buf, const double *twi, const uint16_t *seq_idx,

// Last iteration
if (bf == log2_nfft - 1)
o_buf = buf;
o_buf = final_store;

// Update pointers
const double *re_u_i = i_buf;
Expand Down
3 changes: 2 additions & 1 deletion sw/spatzBenchmarks/dp-fft-cache/kernel/fft.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,8 @@
inline void fft_sc(double *s, double *buf, const double *twi,
const uint16_t *seq_idx, const uint16_t *rev_idx,
const unsigned int nfft, const unsigned int log2_nfft,
const unsigned int cid) __attribute__((always_inline));
const unsigned int cid, const double *final_store)
__attribute__((always_inline));

// Dual-core
inline void fft_2c(const double *s, double *buf, const double *twi,
Expand Down
42 changes: 12 additions & 30 deletions sw/spatzBenchmarks/dp-fft-cache/main.c
Original file line number Diff line number Diff line change
Expand Up @@ -23,12 +23,7 @@
#include DATAHEADER
#include "kernel/fft.c"

double *samples;
double *buffer;
double *twiddle;

uint16_t *store_idx;
uint16_t *bitrev;
double *out;

static inline int fp_check(const double a, const double b) {
const double threshold = 0.00001;
Expand All @@ -48,7 +43,6 @@ int main() {
if (cid == 0) {
// Init the cache
l1d_init();
l1d_wait();
}
// Wait for all cores to finish
snrt_cluster_hw_barrier();
Expand All @@ -59,26 +53,13 @@ int main() {
// Reset timer
unsigned int timer = (unsigned int)-1;

// Allocate the matrices
// Add an output buffer in SPM for storing the final result (avoids racing between cores)
if (cid == 0) {
samples = (double *)snrt_l1alloc(2 * NFFT * sizeof(double));
buffer = (double *)snrt_l1alloc(2 * NFFT * sizeof(double));
twiddle = (double *)snrt_l1alloc((2 * NTWI + NFFT) * sizeof(double));
store_idx =
(uint16_t *)snrt_l1alloc(log2_nfft * (NFFT / 4) * sizeof(uint16_t));
bitrev = (uint16_t *)snrt_l1alloc((NFFT / 4) * sizeof(uint16_t));
out = (double *)snrt_l1alloc(2 * NFFT * sizeof(double));
}

// Initialize the matrices
if (cid == 0) {
snrt_dma_start_1d(samples, samples_dram, 2 * NFFT * sizeof(double));
snrt_dma_start_1d(buffer, buffer_dram, 2 * NFFT * sizeof(double));
snrt_dma_start_1d(twiddle, twiddle_dram,
(2 * NTWI + NFFT) * sizeof(double));
snrt_dma_start_1d(store_idx, store_idx_dram,
log2_nfft * (NFFT / 4) * sizeof(uint16_t));
snrt_dma_start_1d(bitrev, bitrev_dram, (NFFT / 4) * sizeof(uint16_t));
snrt_dma_wait_all();
snrt_dma_start_1d(out, buffer_dram, 2 * NFFT * sizeof(double));
}

// Wait for all cores to finish
Expand All @@ -87,6 +68,7 @@ int main() {
// Calculate pointers for the second butterfly onwards
double *s_ = samples_dram + cid * (NFFT >> 1);
double *buf_ = buffer_dram + cid * (NFFT >> 1);
double *out_ = out + cid * (NFFT >> 1);
// double *twi_ = twiddle + NFFT;
double *twi_ = twiddle_dram + NFFT;

Expand All @@ -107,7 +89,7 @@ int main() {
snrt_cluster_hw_barrier();

// Fall back into the single-core case
fft_sc(s_, buf_, twi_, store_idx_dram, bitrev_dram, NFFT >> 1, log2_nfft, cid);
fft_sc(s_, buf_, twi_, store_idx_dram, bitrev_dram, NFFT >> 1, log2_nfft, cid, out_);

// Wait for all cores to finish fft
snrt_cluster_hw_barrier();
Expand All @@ -131,8 +113,8 @@ int main() {
// Display runtime
if (cid == 0) {
long unsigned int performance =
1000 * 10 * NFFT * log2_nfft * 6 / 5 / timer;
long unsigned int utilization = performance / (2 * num_cores * 4);
1000 * 5 * NFFT * (log2_nfft+1) / timer;
long unsigned int utilization = (1000 * performance) / (1250 * num_cores * 4);

printf("\n----- fft on %d samples -----\n", NFFT);
printf("The execution took %u cycles.\n", timer);
Expand All @@ -141,17 +123,17 @@ int main() {

// Verify the real part
for (unsigned int i = 0; i < NFFT; i++) {
if (fp_check(buffer_dram[i], gold_out_dram[2 * i])) {
if (fp_check(out[i], gold_out_dram[2 * i])) {
printf("Error: Index %d -> Result = %f, Expected = %f\n", i,
(float)buffer_dram[i], (float)gold_out_dram[2 * i]);
(float)out[i], (float)gold_out_dram[2 * i]);
}
}

// Verify the imaginary part
for (unsigned int i = 0; i < NFFT; i++) {
if (fp_check(buffer_dram[i + NFFT], gold_out_dram[2 * i + 1])) {
if (fp_check(out[i + NFFT], gold_out_dram[2 * i + 1])) {
printf("Error: Index %d -> Result = %f, Expected = %f\n", i + NFFT,
(float)buffer_dram[i + NFFT], (float)gold_out_dram[2 * i + 1]);
(float)out[i + NFFT], (float)gold_out_dram[2 * i + 1]);
}
}
}
Expand Down
9 changes: 7 additions & 2 deletions sw/spatzBenchmarks/dp-fft/main.c
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,11 @@ int main() {
const unsigned int num_cores = snrt_cluster_core_num();
const unsigned int cid = snrt_cluster_core_idx();

if (cid == 0) {
// Init the cache
l1d_init();
}

// log2(nfft).
const unsigned int log2_nfft = 31 - __builtin_clz(NFFT >> 1);

Expand Down Expand Up @@ -114,8 +119,8 @@ int main() {
// Display runtime
if (cid == 0) {
long unsigned int performance =
1000 * 10 * NFFT * log2_nfft * 6 / 5 / timer;
long unsigned int utilization = performance / (2 * num_cores * 4);
1000 * 5 * NFFT * (log2_nfft+1) / timer;
long unsigned int utilization = (1000 * performance) / (1250 * num_cores * 4);

printf("\n----- fft on %d samples -----\n", NFFT);
printf("The execution took %u cycles.\n", timer);
Expand Down

0 comments on commit 4e2de3c

Please sign in to comment.