Xilinx · lstasytis · Sep 16, 2024 · Oct 2, 2024 · Oct 15, 2024 · Oct 15, 2024
diff --git a/src/finn/builder/build_dataflow_config.py b/src/finn/builder/build_dataflow_config.py
@@ -40,11 +40,16 @@
 
 class AutoFIFOSizingMethod(str, Enum):
     "Select the type of automatic FIFO sizing strategy."
-
     CHARACTERIZE = "characterize"
     LARGEFIFO_RTLSIM = "largefifo_rtlsim"
 
 
+class FIFOCharacterizationMethod(str, Enum):
+    "Select how characteristic functions are derived for FIFO sizing."
+    CHARACTERIZE_RTLSIM = "rtlsim"  # empirical, via per-node RTL simulation
+    CHARACTERIZE_ANALYTICAL = "analytical"  # analytical functions, where available
+
+
 class ShellFlowType(str, Enum):
     """For builds that produce a bitfile, select the shell flow that will integrate
     the FINN-generated accelerator."""
@@ -116,9 +121,9 @@ class VerificationStepType(str, Enum):
     "step_apply_folding_config",
     "step_minimize_bit_width",
     "step_generate_estimate_reports",
+    "step_set_fifo_depths",
     "step_hw_codegen",
     "step_hw_ipgen",
-    "step_set_fifo_depths",
     "step_create_stitched_ip",
     "step_measure_rtlsim_performance",
     "step_out_of_context_synthesis",
@@ -273,6 +278,15 @@ class DataflowBuildConfig:
     #: setting the FIFO sizes.
     auto_fifo_strategy: Optional[AutoFIFOSizingMethod] = AutoFIFOSizingMethod.LARGEFIFO_RTLSIM
 
+    #: Which strategy will be used for characteristic function-based FIFO sizing.
+    #: CHARACTERIZE_RTLSIM runs RTL simulation for each node to derive its
+    #: characteristic functions empirically.
+    #: CHARACTERIZE_ANALYTICAL uses analytical functions where available, avoiding
+    #: the generation of IP cores.
+    characteristic_function_strategy: Optional[
+        FIFOCharacterizationMethod
+    ] = FIFOCharacterizationMethod.CHARACTERIZE_RTLSIM
+
     #: Avoid using C++ rtlsim for auto FIFO sizing and rtlsim throughput test
     #: if set to True, always using Python instead
     force_python_rtlsim: Optional[bool] = False

diff --git a/src/finn/builder/build_dataflow_steps.py b/src/finn/builder/build_dataflow_steps.py
@@ -553,14 +553,18 @@ def step_set_fifo_depths(model: ModelWrapper, cfg: DataflowBuildConfig):
             model = model.transform(InsertDWC())
             model = model.transform(SpecializeLayers(cfg._resolve_fpga_part()))
             model = model.transform(GiveUniqueNodeNames())
+            model = model.transform(AnnotateCycles())
+
+            period = int(model.analysis(dataflow_performance)["max_cycles"] * 3 + 10)
             model = model.transform(
-                PrepareIP(cfg._resolve_fpga_part(), cfg._resolve_hls_clk_period())
+                DeriveCharacteristic(
+                    model,
+                    period,
+                    cfg.characteristic_function_strategy,
+                    cfg._resolve_fpga_part(),
+                    cfg._resolve_hls_clk_period(),
+                )
             )
-            model = model.transform(HLSSynthIP())
-            model = model.transform(PrepareRTLSim())
-            model = model.transform(AnnotateCycles())
-            period = model.analysis(dataflow_performance)["max_cycles"] + 10
-            model = model.transform(DeriveCharacteristic(period))
             model = model.transform(DeriveFIFOSizes())
             model = model.transform(
                 InsertFIFO(
@@ -623,6 +627,7 @@ def step_set_fifo_depths(model: ModelWrapper, cfg: DataflowBuildConfig):
         "depth_trigger_uram",
         "depth_trigger_bram",
     ]
+
     extract_model_config_to_json(model, cfg.output_dir + "/final_hw_config.json", hw_attrs)
 
     # perform FIFO splitting and shallow FIFO removal only after the final config
@@ -634,8 +639,8 @@ def step_set_fifo_depths(model: ModelWrapper, cfg: DataflowBuildConfig):
 
     # after FIFOs are ready to go, call PrepareIP and HLSSynthIP again
     # this will only run for the new nodes (e.g. FIFOs and DWCs)
-    model = model.transform(PrepareIP(cfg._resolve_fpga_part(), cfg._resolve_hls_clk_period()))
-    model = model.transform(HLSSynthIP())
+    # NOTE(review): the PrepareIP/HLSSynthIP calls are disabled here; presumably the later
+    # step_hw_codegen/step_hw_ipgen steps now cover the new FIFO/DWC nodes -- TODO confirm.
     return model
 
 

diff --git a/src/finn/custom_op/fpgadataflow/channelwise_op.py b/src/finn/custom_op/fpgadataflow/channelwise_op.py
@@ -232,3 +232,41 @@ def execute_node(self, context, graph):
         sess = rt.InferenceSession(model_func.SerializeToString())
         result = sess.run(None, idict)
         context[node.output[0]] = np.asarray(result, dtype=np.float32).reshape(oshape)
+
+    def prepare_kwargs_for_characteristic_fx(self):
+        """Return the (NF, dim) tuple consumed by the characteristic functions.
+
+        NF is NumChannels // PE; dim is the product of the folded output
+        shape without its first and last entries.
+        """
+        pe = self.get_nodeattr("PE")
+        num_channels = self.get_nodeattr("NumChannels")
+        # floor division stays exact; int(a / b) round-trips through float
+        nf = num_channels // pe
+        dim = np.prod(self.get_folded_output_shape()[1:-1])
+        return (nf, dim)
+
+    def characteristic_fx_input(self, txns, cycles, counter, kwargs):
+        """Append one period of the input characteristic function to txns.
+
+        One entry is appended per cycle for `dim` cycles, the counter advancing
+        each time; NF is unused. Returns the updated (txns, cycles, counter).
+        """
+        _, n_beats = kwargs
+        # len(range(.)) clamps a non-positive count to zero, like the plain loop
+        steps = len(range(n_beats))
+        txns.extend(counter + offset for offset in range(steps))
+
+        return txns, cycles + steps, counter + steps
+
+    def characteristic_fx_output(self, txns, cycles, counter, kwargs):
+        """Append one period of the output characteristic function to txns.
+
+        For each of `dim` cycles the current counter is recorded and then
+        advanced by one. Returns the updated (txns, cycles, counter).
+        """
+        _, dim = kwargs
+        for _ in range(dim):
+            txns.append(counter)
+            counter, cycles = counter + 1, cycles + 1
+        return txns, cycles, counter
diff --git a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py
@@ -277,3 +277,243 @@ def execute_node(self, context, graph):
         # this automatically updates the execution context
         inst = getCustomOp(im2col_node)
         inst.execute_node(context, model_im2col.graph)
+
+    def prepare_kwargs_for_characteristic_fx(self):
+        """Precompute the constants used by the analytical characteristic functions.
+
+        Reads the node attributes and returns a positional 23-tuple that
+        characteristic_fx_input and characteristic_fx_output unpack in exactly
+        this order, so entries must not be reordered or dropped even when a
+        consumer ignores them.
+        """
+        # index 0 of each attribute is used as the x value, index 1 as the y value
+        ifm_dim = self.get_nodeattr("IFMDim")
+        ofm_dim = self.get_nodeattr("OFMDim")
+        kernel_dim = self.get_nodeattr("ConvKernelDim")
+        stride = self.get_nodeattr("Stride")
+        IFMDim_x = ifm_dim[0]
+        OFMDim_x, OFMDim_y = ofm_dim[0], ofm_dim[1]
+        ConvKernelDim_x, ConvKernelDim_y = kernel_dim[0], kernel_dim[1]
+        Stride_x, Stride_y = stride[0], stride[1]
+
+        SIMD = self.get_nodeattr("SIMD")
+        IFMChannels = self.get_nodeattr("IFMChannels")
+        DEPTHWISE = self.get_nodeattr("depthwise")
+        is1d = self.get_nodeattr("is1D")
+
+        # floor division keeps the fold count in exact integer arithmetic instead
+        # of round-tripping through float as int(a / b) does
+        SIMD_COUNT = IFMChannels // SIMD
+        # same value; kept as its own tuple entry because the consumers unpack it
+        multiplying_factor = SIMD_COUNT
+
+        # window/buffer quantities along x
+        OUTPUT_SIZE = OFMDim_x * ConvKernelDim_x * SIMD_COUNT
+        INPUT_SIZE = IFMDim_x * SIMD_COUNT
+        WINDOW_SIZE = ConvKernelDim_x * SIMD_COUNT
+        if DEPTHWISE:
+            BUFFER_SIZE = ConvKernelDim_x * SIMD_COUNT
+            READ_CYCLES = SIMD_COUNT * (ConvKernelDim_x - 1) - (ConvKernelDim_x - 1)
+            FINISH = IFMDim_x - ConvKernelDim_x - 2
+        else:
+            BUFFER_SIZE = (ConvKernelDim_x - 1) * SIMD_COUNT
+            READ_CYCLES = 0
+            FINISH = 0
+        OCNT_INITIAL = BUFFER_SIZE + (Stride_x - 1)
+        DEFAULT_FIFO_DEPTH = 2
+
+        # block-level cycle counts; max_cycles is computed once and reused
+        number_blocks = ConvKernelDim_y // Stride_y + 1
+        cycles_write_block = OFMDim_x * ConvKernelDim_x * ConvKernelDim_y * multiplying_factor
+        cycles_read_block = Stride_x * IFMDim_x * multiplying_factor
+        max_cycles = max(cycles_write_block, cycles_read_block)
+        initial_buffer = IFMDim_x * ConvKernelDim_y * multiplying_factor
+        baseIter = initial_buffer + OFMDim_y * max_cycles
+        READ_DELAY = (
+            number_blocks
+            * ConvKernelDim_x
+            * ConvKernelDim_y
+            * OFMDim_x
+            * OFMDim_y
+            * multiplying_factor
+            - ConvKernelDim_x * ConvKernelDim_y * OFMDim_x
+        )
+        READ_ITES = (baseIter - OFMDim_y) // max_cycles
+
+        # positional order below is the unpacking order of both consumers
+        return (
+            SIMD_COUNT,
+            Stride_x,
+            Stride_y,
+            OUTPUT_SIZE,
+            INPUT_SIZE,
+            WINDOW_SIZE,
+            BUFFER_SIZE,
+            READ_CYCLES,
+            OCNT_INITIAL,
+            DEPTHWISE,
+            DEFAULT_FIFO_DEPTH,
+            is1d,
+            multiplying_factor,
+            number_blocks,
+            cycles_write_block,
+            cycles_read_block,
+            max_cycles,
+            baseIter,
+            initial_buffer,
+            FINISH,
+            OFMDim_y,
+            READ_DELAY,
+            READ_ITES,
+        )
+
+    def characteristic_fx_input(self, txns, cycles, counter, kwargs):
+        """Append one period of the input characteristic function to txns.
+
+        kwargs is the tuple built by prepare_kwargs_for_characteristic_fx. One
+        entry is appended to txns per modelled cycle, holding the transaction
+        counter at that cycle. Returns the updated (txns, cycles, counter).
+        """
+        (
+            SIMD_COUNT,
+            Stride_x,
+            Stride_y,
+            OUTPUT_SIZE,
+            INPUT_SIZE,
+            WINDOW_SIZE,
+            BUFFER_SIZE,
+            READ_CYCLES,
+            OCNT_INITIAL,
+            DEPTHWISE,
+            DEFAULT_FIFO_DEPTH,
+            is1d,
+            multiplying_factor,
+            number_blocks,
+            cycles_write_block,
+            cycles_read_block,
+            max_cycles,
+            baseIter,
+            initial_buffer,
+            FINISH,
+            OFMDim_y,
+            READ_DELAY,
+            READ_ITES,
+        ) = kwargs
+
+        # wrap-around limit and start value of the modelled output counter
+        OCNT_MAX = BUFFER_SIZE if DEPTHWISE else WINDOW_SIZE
+        if DEPTHWISE:
+            ocnt = SIMD_COUNT
+        elif OCNT_INITIAL < WINDOW_SIZE:
+            ocnt = OCNT_INITIAL
+        else:
+            ocnt = -1
+
+        # fifo filling: the counter advances on every cycle
+        for _ in range(DEFAULT_FIFO_DEPTH):
+            txns.append(counter)
+            counter += 1
+            cycles += 1
+
+        if is1d:
+            # number of words that can still be counted after the fifo filling
+            remaining = INPUT_SIZE - DEFAULT_FIFO_DEPTH
+            accepted = 0
+            for i in range(OUTPUT_SIZE):
+                txns.append(counter)
+                cycles += 1
+                # evaluated on ocnt from before this cycle's update
+                may_write = i < OCNT_MAX or ocnt < SIMD_COUNT * Stride_x
+                if i > 0:
+                    # advance ocnt, wrapping to 0 when it reaches OCNT_MAX
+                    ocnt = 0 if ocnt + 1 == OCNT_MAX else ocnt + 1
+                if may_write and accepted < remaining:
+                    counter += 1
+                    accepted += 1
+            return txns, cycles, counter
+
+        # non-1D case: the counter advances every cycle for the initial buffer
+        # plus one read block
+        for _ in range(initial_buffer + cycles_read_block - 1):
+            txns.append(counter)
+            counter += 1
+            cycles += 1
+        txns.append(counter)  # one extra cycle for the loop tail
+        cycles += 1
+
+        stall_cycles = cycles_write_block - cycles_read_block
+        final_row = OFMDim_y - 2
+        for row in range(OFMDim_y - 1):
+            # counter is held for the cycles by which a write block exceeds
+            # a read block
+            for _ in range(stall_cycles):
+                txns.append(counter)
+                cycles += 1
+            # the final row contributes no further counter increments
+            if row == final_row:
+                continue
+            # pre-increment: each entry records the already-advanced counter
+            for _ in range(cycles_read_block - 1):
+                counter += 1
+                txns.append(counter)
+                cycles += 1
+
+        return txns, cycles, counter
+
+    def characteristic_fx_output(self, txns, cycles, counter, kwargs):
+        """Append one period of the output characteristic function to txns.
+
+        kwargs is the tuple built by prepare_kwargs_for_characteristic_fx. The
+        period consists of cycles with a constant counter followed by cycles
+        that each advance the counter by one. Returns (txns, cycles, counter).
+        """
+        (
+            SIMD_COUNT,
+            Stride_x,
+            Stride_y,
+            OUTPUT_SIZE,
+            INPUT_SIZE,
+            WINDOW_SIZE,
+            BUFFER_SIZE,
+            READ_CYCLES,
+            OCNT_INITIAL,
+            DEPTHWISE,
+            DEFAULT_FIFO_DEPTH,
+            is1d,
+            multiplying_factor,
+            number_blocks,
+            cycles_write_block,
+            cycles_read_block,
+            max_cycles,
+            baseIter,
+            initial_buffer,
+            FINISH,
+            OFMDim_y,
+            READ_DELAY,
+            READ_ITES,
+        ) = kwargs
+
+        # hyperparameter: base number of idle cycles at the start of the period
+        INITIAL_LOOP_CYCLES = 5
+
+        if is1d:
+            # READ_CYCLES extends the idle phase; a negative value adds nothing,
+            # matching a range() over it
+            idle_cycles = INITIAL_LOOP_CYCLES + max(READ_CYCLES, 0)
+            active_cycles = OUTPUT_SIZE
+        else:
+            idle_cycles = initial_buffer + INITIAL_LOOP_CYCLES - 1
+            active_cycles = baseIter - initial_buffer
+
+        # idle phase: cycles elapse while the counter stays unchanged
+        for _ in range(idle_cycles):
+            txns.append(counter)
+            cycles += 1
+
+        # active phase: the counter advances on every cycle
+        for _ in range(active_cycles):
+            txns.append(counter)
+            counter += 1
+            cycles += 1
+
+        return txns, cycles, counter
diff --git a/src/finn/custom_op/fpgadataflow/duplicatestreams.py b/src/finn/custom_op/fpgadataflow/duplicatestreams.py
@@ -166,12 +166,17 @@ def get_verilog_top_module_intf_names(self):
             )
         return intf_names
 
-    def derive_characteristic_fxns(self, period):
+    def derive_characteristic_fxns(
+        self, model, period, strategy, fpga_part, clk_period, op_type, override_dict=None
+    ):
         n_inps = np.prod(self.get_folded_input_shape()[:-1])
         io_dict = {
             "inputs": {
                 "in0": [0 for i in range(n_inps)],
             },
             "outputs": {"out0": [], "out1": []},
         }
-        super().derive_characteristic_fxns(period, override_rtlsim_dict=io_dict)
+        # NOTE(review): override_dict from the caller is ignored; io_dict is always passed on
+        super().derive_characteristic_fxns(
+            model, period, strategy, fpga_part, clk_period, op_type, override_dict=io_dict
+        )