Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Analytical FIFO sizing #1185

Open
wants to merge 10 commits into
base: dev
Choose a base branch
from
18 changes: 16 additions & 2 deletions src/finn/builder/build_dataflow_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,11 +40,16 @@

class AutoFIFOSizingMethod(str, Enum):
    """Select the type of automatic FIFO sizing strategy."""

    # The ``str`` mixin makes every member compare equal to its string value.
    CHARACTERIZE = "characterize"
    LARGEFIFO_RTLSIM = "largefifo_rtlsim"


class FIFOCharacterizationMethod(str, Enum):
    """Select the strategy for characteristic sizing of FIFOs."""

    # The ``str`` mixin makes every member compare equal to its string value.
    CHARACTERIZE_RTLSIM = "rtlsim"
    CHARACTERIZE_ANALYTICAL = "analytical"


class ShellFlowType(str, Enum):
"""For builds that produce a bitfile, select the shell flow that will integrate
the FINN-generated accelerator."""
Expand Down Expand Up @@ -116,9 +121,9 @@ class VerificationStepType(str, Enum):
"step_apply_folding_config",
"step_minimize_bit_width",
"step_generate_estimate_reports",
"step_set_fifo_depths",
"step_hw_codegen",
"step_hw_ipgen",
"step_set_fifo_depths",
"step_create_stitched_ip",
"step_measure_rtlsim_performance",
"step_out_of_context_synthesis",
Expand Down Expand Up @@ -273,6 +278,15 @@ class DataflowBuildConfig:
#: setting the FIFO sizes.
auto_fifo_strategy: Optional[AutoFIFOSizingMethod] = AutoFIFOSizingMethod.LARGEFIFO_RTLSIM

#: Which strategy will be used for characteristic function-based FIFO sizing.
#: CHARACTERIZE_RTLSIM will result in performing RTLSIM for each node
#: to deduce the characteristic functions empirically.
#: CHARACTERIZE_ANALYTICAL will use analytical functions if available, avoiding
#: the generation of IP cores.
characteristic_function_strategy: Optional[
FIFOCharacterizationMethod
] = FIFOCharacterizationMethod.CHARACTERIZE_RTLSIM

#: Avoid using C++ rtlsim for auto FIFO sizing and rtlsim throughput test
#: if set to True, always using Python instead
force_python_rtlsim: Optional[bool] = False
Expand Down
21 changes: 13 additions & 8 deletions src/finn/builder/build_dataflow_steps.py
Original file line number Diff line number Diff line change
Expand Up @@ -553,14 +553,18 @@ def step_set_fifo_depths(model: ModelWrapper, cfg: DataflowBuildConfig):
model = model.transform(InsertDWC())
model = model.transform(SpecializeLayers(cfg._resolve_fpga_part()))
model = model.transform(GiveUniqueNodeNames())
model = model.transform(AnnotateCycles())

period = int(model.analysis(dataflow_performance)["max_cycles"] * 3 + 10)
model = model.transform(
PrepareIP(cfg._resolve_fpga_part(), cfg._resolve_hls_clk_period())
DeriveCharacteristic(
model,
period,
cfg.characteristic_function_strategy,
cfg._resolve_fpga_part(),
cfg._resolve_hls_clk_period(),
)
)
model = model.transform(HLSSynthIP())
model = model.transform(PrepareRTLSim())
model = model.transform(AnnotateCycles())
period = model.analysis(dataflow_performance)["max_cycles"] + 10
model = model.transform(DeriveCharacteristic(period))
model = model.transform(DeriveFIFOSizes())
model = model.transform(
InsertFIFO(
Expand Down Expand Up @@ -623,6 +627,7 @@ def step_set_fifo_depths(model: ModelWrapper, cfg: DataflowBuildConfig):
"depth_trigger_uram",
"depth_trigger_bram",
]

extract_model_config_to_json(model, cfg.output_dir + "/final_hw_config.json", hw_attrs)

# perform FIFO splitting and shallow FIFO removal only after the final config
Expand All @@ -634,8 +639,8 @@ def step_set_fifo_depths(model: ModelWrapper, cfg: DataflowBuildConfig):

# after FIFOs are ready to go, call PrepareIP and HLSSynthIP again
# this will only run for the new nodes (e.g. FIFOs and DWCs)
model = model.transform(PrepareIP(cfg._resolve_fpga_part(), cfg._resolve_hls_clk_period()))
model = model.transform(HLSSynthIP())
# model = model.transform(PrepareIP(cfg._resolve_fpga_part(), cfg._resolve_hls_clk_period()))
# model = model.transform(HLSSynthIP())
return model


Expand Down
38 changes: 38 additions & 0 deletions src/finn/custom_op/fpgadataflow/channelwise_op.py
Original file line number Diff line number Diff line change
Expand Up @@ -232,3 +232,41 @@ def execute_node(self, context, graph):
sess = rt.InferenceSession(model_func.SerializeToString())
result = sess.run(None, idict)
context[node.output[0]] = np.asarray(result, dtype=np.float32).reshape(oshape)

def prepare_kwargs_for_characteristic_fx(self):
    """Collect the parameters consumed by the characteristic functions.

    Returns:
        A ``(NF, dim)`` tuple of plain Python ints, where ``NF`` is
        ``NumChannels // PE`` and ``dim`` is the product of the folded
        output shape with its first and last axes dropped.
    """
    pe = self.get_nodeattr("PE")
    num_channels = self.get_nodeattr("NumChannels")
    # Floor division keeps the fold count an exact integer instead of
    # round-tripping through a float.
    nf = num_channels // pe
    # np.prod returns the float 1.0 for an empty slice and a NumPy scalar
    # otherwise; coerce to int so that range(dim) in the consumers always works.
    dim = int(np.prod(self.get_folded_output_shape()[1:-1]))
    return (nf, dim)

def characteristic_fx_input(self, txns, cycles, counter, kwargs):
    """Append one period of the input characteristic function to ``txns``.

    For ``dim`` consecutive cycles the current value of ``counter`` is
    recorded and then incremented, i.e. one transaction per cycle.

    Args:
        txns: list of recorded counter values; extended in place.
        cycles: cycle count before this period.
        counter: transaction counter before this period.
        kwargs: the ``(NF, dim)`` tuple; only ``dim`` is used here.

    Returns:
        The tuple ``(txns, cycles, counter)`` after ``dim`` cycles.
    """
    _, dim = kwargs
    # len(range(...)) mirrors the loop semantics: non-positive dim -> 0 steps.
    steps = len(range(dim))
    txns.extend(range(counter, counter + steps))
    return txns, cycles + steps, counter + steps

def characteristic_fx_output(self, txns, cycles, counter, kwargs):
    """Append one period of the output characteristic function to ``txns``.

    For ``dim`` consecutive cycles the current value of ``counter`` is
    recorded and then incremented, i.e. one transaction per cycle.

    Args:
        txns: list of recorded counter values; extended in place.
        cycles: cycle count before this period.
        counter: transaction counter before this period.
        kwargs: the ``(NF, dim)`` tuple; only ``dim`` is used here.

    Returns:
        The tuple ``(txns, cycles, counter)`` after ``dim`` cycles.
    """
    _, dim = kwargs
    # len(range(...)) mirrors the loop semantics: non-positive dim -> 0 steps.
    steps = len(range(dim))
    txns.extend(range(counter, counter + steps))
    return txns, cycles + steps, counter + steps
240 changes: 240 additions & 0 deletions src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py
Original file line number Diff line number Diff line change
Expand Up @@ -277,3 +277,243 @@ def execute_node(self, context, graph):
# this automatically updates the execution context
inst = getCustomOp(im2col_node)
inst.execute_node(context, model_im2col.graph)

def prepare_kwargs_for_characteristic_fx(self):
    """Derive the parameters consumed by the characteristic functions.

    All values are computed from the node attributes ``IFMDim``, ``OFMDim``,
    ``ConvKernelDim``, ``Stride``, ``SIMD``, ``IFMChannels``, ``depthwise``
    and ``is1D``.

    Returns:
        A 23-element tuple, in this exact order:
        ``(SIMD_COUNT, Stride_x, Stride_y, OUTPUT_SIZE, INPUT_SIZE,
        WINDOW_SIZE, BUFFER_SIZE, READ_CYCLES, OCNT_INITIAL, DEPTHWISE,
        DEFAULT_FIFO_DEPTH, is1d, multiplying_factor, number_blocks,
        cycles_write_block, cycles_read_block, max_cycles, baseIter,
        initial_buffer, FINISH, OFMDim_y, READ_DELAY, READ_ITES)``.
        The consumers unpack it positionally, so the order must not change.
    """
    ifm_dim_x = self.get_nodeattr("IFMDim")[0]
    ofm_dim = self.get_nodeattr("OFMDim")
    kernel_dim = self.get_nodeattr("ConvKernelDim")
    stride = self.get_nodeattr("Stride")
    ofm_dim_x, ofm_dim_y = ofm_dim[0], ofm_dim[1]
    kernel_x, kernel_y = kernel_dim[0], kernel_dim[1]
    stride_x, stride_y = stride[0], stride[1]

    simd = self.get_nodeattr("SIMD")
    ifm_channels = self.get_nodeattr("IFMChannels")
    depthwise = self.get_nodeattr("depthwise")
    is1d = self.get_nodeattr("is1D")

    # Integer floor division: these are counts, so avoid going through floats.
    simd_count = ifm_channels // simd
    output_size = ofm_dim_x * kernel_x * simd_count
    input_size = ifm_dim_x * simd_count
    window_size = kernel_x * simd_count
    if depthwise:
        buffer_size = kernel_x * simd_count
        read_cycles = simd_count * (kernel_x - 1) - (kernel_x - 1)
        finish = ifm_dim_x - kernel_x - 2
    else:
        buffer_size = (kernel_x - 1) * simd_count
        read_cycles = 0
        finish = 0

    ocnt_initial = buffer_size + (stride_x - 1)
    default_fifo_depth = 2

    # Same quantity as simd_count; kept as a separate tuple slot because the
    # consumers unpack both positions.
    multiplying_factor = simd_count
    number_blocks = kernel_y // stride_y + 1
    cycles_write_block = ofm_dim_x * kernel_x * kernel_y * multiplying_factor
    cycles_read_block = stride_x * ifm_dim_x * multiplying_factor
    max_cycles = max(cycles_write_block, cycles_read_block)
    initial_buffer = ifm_dim_x * kernel_y * multiplying_factor
    base_iter = initial_buffer + ofm_dim_y * max_cycles

    read_delay = (
        number_blocks * kernel_x * kernel_y * ofm_dim_x * ofm_dim_y * multiplying_factor
        - kernel_x * kernel_y * ofm_dim_x
    )
    read_ites = (base_iter - ofm_dim_y) // max_cycles

    return (
        simd_count,
        stride_x,
        stride_y,
        output_size,
        input_size,
        window_size,
        buffer_size,
        read_cycles,
        ocnt_initial,
        depthwise,
        default_fifo_depth,
        is1d,
        multiplying_factor,
        number_blocks,
        cycles_write_block,
        cycles_read_block,
        max_cycles,
        base_iter,
        initial_buffer,
        finish,
        ofm_dim_y,
        read_delay,
        read_ites,
    )

def characteristic_fx_input(self, txns, cycles, counter, kwargs):
# Compute one period of the input characteristic function
# Appends to `txns`, advances `cycles` and `counter`, and returns the
# updated (txns, cycles, counter) triple.
# NOTE(review): leading indentation is missing in this view of the file, so
# the nesting of the statements below cannot be confirmed from here — verify
# against the repository before changing any control flow.

# `kwargs` is unpacked positionally into exactly 23 names; a tuple of any
# other length raises ValueError here.
# presumably produced by prepare_kwargs_for_characteristic_fx — TODO confirm
(
SIMD_COUNT,
Stride_x,
Stride_y,
OUTPUT_SIZE,
INPUT_SIZE,
WINDOW_SIZE,
BUFFER_SIZE,
READ_CYCLES,
OCNT_INITIAL,
DEPTHWISE,
DEFAULT_FIFO_DEPTH,
is1d,
multiplying_factor,
number_blocks,
cycles_write_block,
cycles_read_block,
max_cycles,
baseIter,
initial_buffer,
FINISH,
OFMDim_y,
READ_DELAY,
READ_ITES,
) = kwargs

# Pick the wrap-around limit (OCNT_MAX) and the starting value of `ocnt`;
# `ocnt` is reset to 0 further down whenever it reaches OCNT_MAX.
if DEPTHWISE:
OCNT_MAX = BUFFER_SIZE
ocnt = SIMD_COUNT

else:
OCNT_MAX = WINDOW_SIZE
if OCNT_INITIAL < WINDOW_SIZE:
ocnt = OCNT_INITIAL
else:
ocnt = -1

# fifo filling
# Runs DEFAULT_FIFO_DEPTH iterations before the main phase.
for i in range(0, DEFAULT_FIFO_DEPTH):
txns.append(counter)
counter += 1
cycles += 1

# main function

# Number of increments applied to `counter` in the 1D branch; compared
# against INPUT_SIZE - DEFAULT_FIFO_DEPTH below.
inp_count = 0

if is1d:
# 1D case: OUTPUT_SIZE iterations, one `txns` entry each.
for i in range(0, OUTPUT_SIZE):
txns.append(counter)
# presumably `we` = write enable and `re` = read enable — TODO confirm
we = (i < OCNT_MAX) or (ocnt < (SIMD_COUNT * Stride_x))
re = i > 0

if re:
ocnt += 1
if ocnt == OCNT_MAX:
ocnt = 0
if we:
# The check below caps how far `counter` is advanced in this branch.
if inp_count < INPUT_SIZE - DEFAULT_FIFO_DEPTH:
counter += 1
inp_count += 1

cycles += 1
else:
# Non-1D case: first a run of initial_buffer + cycles_read_block - 1
# iterations, then one extra entry, then OFMDim_y - 1 outer iterations.
for i in range(0, initial_buffer + cycles_read_block - 1):
txns.append(counter)
cycles += 1
counter += 1

txns.append(counter)
cycles += 1 # one extra for loop tail

for i in range(0, OFMDim_y - 1):
# This loop appends `counter` without incrementing it.
for j in range(0, cycles_write_block - cycles_read_block):
txns.append(counter)
cycles += 1

# The `i < OFMDim_y - 2` test excludes the last outer iteration.
# NOTE(review): which of the following statements it guards depends on the
# lost indentation — confirm.
for j in range(0, cycles_read_block - 1):
if i < OFMDim_y - 2:
counter += 1
txns.append(counter)
cycles += 1
# else:
# if j < FINISH:
# counter+=1
# txns.append(counter)
# cycles+=1
#
return txns, cycles, counter

def characteristic_fx_output(self, txns, cycles, counter, kwargs):
    """Append one period of the output characteristic function to ``txns``.

    The period has two phases: an idle phase, in which the unchanged
    ``counter`` is recorded once per cycle, followed by an active phase, in
    which ``counter`` is recorded and then incremented once per cycle.

    Args:
        txns: list of recorded counter values; extended in place.
        cycles: cycle count before this period.
        counter: transaction counter before this period.
        kwargs: the 23-element parameter tuple. Only ``OUTPUT_SIZE``,
            ``READ_CYCLES``, ``is1d``, ``baseIter`` and ``initial_buffer``
            are used here; the full positional unpack is kept so a tuple of
            the wrong length still raises ``ValueError``.

    Returns:
        The tuple ``(txns, cycles, counter)`` after both phases.
    """
    (
        _simd_count, _stride_x, _stride_y,
        output_size,
        _input_size, _window_size, _buffer_size,
        read_cycles,
        _ocnt_initial, _depthwise, _default_fifo_depth,
        is1d,
        _multiplying_factor, _number_blocks,
        _cycles_write_block, _cycles_read_block, _max_cycles,
        base_iter,
        initial_buffer,
        _finish, _ofm_dim_y, _read_delay, _read_ites,
    ) = kwargs

    # Hyper-parameter: fixed number of idle cycles at the start of a period.
    initial_loop_cycles = 5

    # len(range(n)) clamps non-positive lengths to 0, like a for-loop would.
    if is1d:
        idle = initial_loop_cycles + len(range(read_cycles))
        active = len(range(output_size))
    else:
        idle = len(range(initial_buffer + initial_loop_cycles - 1))
        active = len(range(base_iter - initial_buffer))

    txns.extend([counter] * idle)
    txns.extend(range(counter, counter + active))
    return txns, cycles + idle + active, counter + active
9 changes: 7 additions & 2 deletions src/finn/custom_op/fpgadataflow/duplicatestreams.py
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

DeriveFIFOSizes expects outFIFODepths to be set for every output of every CustomOp. This is problematic in the current implementation of DuplicateStreams, as the default value of [2] (=1 output with FIFO depth set to 2) is inherited from HWCustomOp, even though DuplicateStreams has NumOutputStreams outputs.

How/where do we set the default value of one attribute depending on another attribute without breaking anything? In verify_node()?

Copy link
Author

@lstasytis lstasytis Jan 23, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

DeriveFIFOSizes should only get called if derive_characteristic_fxns() has already been called for each node; those functions are kept in the node definition classes. So I would say the optimal place would be in that same function, like so:

I'm not sure why this function had 'out1' and 'out2' defined before; was this for a very specific network? I don't think the generalization should break anything.

Original file line number Diff line number Diff line change
Expand Up @@ -166,12 +166,17 @@ def get_verilog_top_module_intf_names(self):
)
return intf_names

def derive_characteristic_fxns(self, period):
def derive_characteristic_fxns(
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This refactoring also needs to be applied to addstreams.py.

self, model, period, strategy, fpga_part, clk_period, op_type, override_dict=None
):
n_inps = np.prod(self.get_folded_input_shape()[:-1])
io_dict = {
"inputs": {
"in0": [0 for i in range(n_inps)],
},
"outputs": {"out0": [], "out1": []},
}
super().derive_characteristic_fxns(period, override_rtlsim_dict=io_dict)

super().derive_characteristic_fxns(
model, period, strategy, fpga_part, clk_period, op_type, override_dict=io_dict
)
Loading