From 52cfc4a2ac4c9feb729ad7acd2adbfb0e1a41207 Mon Sep 17 00:00:00 2001 From: Christoph Berganski Date: Wed, 13 Mar 2024 10:17:08 +0100 Subject: [PATCH 01/51] Fix clipping range issue in RoundAndClipThresholds transformation --- src/finn/transformation/streamline/round_thresholds.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/finn/transformation/streamline/round_thresholds.py b/src/finn/transformation/streamline/round_thresholds.py index 5ba5ee0ff5..2bf3630cff 100644 --- a/src/finn/transformation/streamline/round_thresholds.py +++ b/src/finn/transformation/streamline/round_thresholds.py @@ -57,10 +57,10 @@ def apply(self, model): model.set_tensor_datatype(n.input[1], idtype) graph_modified = True if idtype.is_integer() and ( - (Tnew < (idtype.min() - 1)).any() or (Tnew > (idtype.max() + 1)).any() + (Tnew < (idtype.min())).any() or (Tnew > (idtype.max())).any() ): # clip any large thresholds to input range + 1 - Tnew = np.clip(Tnew, idtype.min() - 1, idtype.max() + 1) + Tnew = np.clip(Tnew, idtype.min(), idtype.max()) model.set_initializer(n.input[1], Tnew) # use same datatype as inputs for thresholds model.set_tensor_datatype(n.input[1], idtype) From c8292e2a27bebb2254f278e409b00f448c35e600 Mon Sep 17 00:00:00 2001 From: Christoph Berganski Date: Sat, 6 Apr 2024 17:06:03 +0200 Subject: [PATCH 02/51] Rework RoundAndClipThresholds to avoid range and type promotion issues See https://github.com/Xilinx/finn/issues/978 --- .../streamline/round_thresholds.py | 105 +++++++++++++----- 1 file changed, 76 insertions(+), 29 deletions(-) diff --git a/src/finn/transformation/streamline/round_thresholds.py b/src/finn/transformation/streamline/round_thresholds.py index 2bf3630cff..2666242730 100644 --- a/src/finn/transformation/streamline/round_thresholds.py +++ b/src/finn/transformation/streamline/round_thresholds.py @@ -26,43 +26,90 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# Need numpy for modifying the onnx graph tensors, which are numpy style arrays import numpy as np + +# QONNX wrapper of ONNX model graphs +from qonnx.core.modelwrapper import ModelWrapper + +# QONNX graph transformation base class from qonnx.transformation.base import Transformation +# Transformation running qonnx datatype inference +from qonnx.transformation.infer_datatypes import InferDataTypes + +# Rounds and clips thresholds to integer values if the node inputs are integer, +# respecting range, representability and data type (promotion) of the container +# data type class RoundAndClipThresholds(Transformation): """For MultiThreshold nodes operating on integer inputs, round up thresholds values to the nearest integer. Additionally, if the input - is unsigned, sets negative thresholds to zero.""" + is unsigned, sets negative thresholds to zero. Type-casts thresholds (back) + to the float32 container type (this is separate from the quantization + annotation). 
Runs InferDataTypes() afterward to propagate any changes to the + quantization data types.""" - def apply(self, model): + # Applies the transform to a whole model graph + def apply(self, model: ModelWrapper): # noqa + # Get the model graph out of the model wrapper object graph = model.graph + # Keep track of whether the graph has been modified graph_modified = False - for n in graph.node: - if n.op_type == "MultiThreshold": - idtype = model.get_tensor_datatype(n.input[0]) - T = model.get_initializer(n.input[1]) - Tnew = np.ceil(T) - if idtype.is_integer() and (T != Tnew).any(): - # round up the thresholds to nearest integer - model.set_initializer(n.input[1], Tnew) - # use same datatype as inputs for thresholds - model.set_tensor_datatype(n.input[1], idtype) - graph_modified = True - if idtype.is_integer() and not idtype.signed() and (Tnew < 0).any(): - # clip any negative thresholds if input is unsigned - Tnew = np.clip(Tnew, 0, None) - model.set_initializer(n.input[1], Tnew) - # use same datatype as inputs for thresholds - model.set_tensor_datatype(n.input[1], idtype) - graph_modified = True - if idtype.is_integer() and ( - (Tnew < (idtype.min())).any() or (Tnew > (idtype.max())).any() - ): - # clip any large thresholds to input range + 1 - Tnew = np.clip(Tnew, idtype.min(), idtype.max()) - model.set_initializer(n.input[1], Tnew) - # use same datatype as inputs for thresholds - model.set_tensor_datatype(n.input[1], idtype) + # Iterate all nodes in the graph keeping track of the index + for index, node in enumerate(graph.node): + # Applies to initializer tensors of MultiThreshold operations + if node.op_type == "MultiThreshold": + # Try to get the thresholds initializer tensor + thresholds = model.get_initializer(node.input[1]) + # There might be no constant thresholds stored as initializer + # tensor inside the model + if thresholds is None: + # Nothing we can do, skip to the next node + continue + # Get the data type of the inputs to this operation + dtype = model.get_tensor_datatype(node.input[0]) + # This transformation only applies to thresholding operations + # operating on integer inputs + if not dtype.is_integer(): + # Nothing we can do, skip to the next node + continue + # Round thresholds up to nearest integer and clip thresholds + # outside the input range + # Note: This might promote the thresholds to float64 and + # introduce extra inaccuracies due to large integers not being + # exactly representable in floating-point representation. + # See for example: np.ceil(np.float32(16777217)) == 16777216 + # fmt: off + new_thresholds = np.clip( + np.ceil(thresholds), dtype.min(), dtype.max() + ) + # fmt: on + # Convert back to the preferred float32 container type + # Note: np.clip might have promoted the thresholds to float64 + # TODO: Maybe consider an int64 container type for thresholds + # rounded to integer? Need to check all other transformations + # and code generation through the whole FINN and QONNX stack + # first, as these probably assume a float32 container type. 
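                # Illustrative example of the representability issue noted
                # above (added for clarity, not part of the original patch):
                # float32 has a 24-bit significand, so
                #   np.float32(2 ** 24 + 1) == np.float32(2 ** 24)
                # evaluates to True, and thresholds of 25 bits or more may
                # silently lose precision when cast back to float32 below.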
+ new_thresholds = new_thresholds.astype(np.float32) + # Insert the rounded and clipped thresholds back into the model + model.set_initializer(node.input[1], new_thresholds) + # The rounded and clipped thresholds now fit into the input data + # type + model.set_tensor_datatype(node.input[1], dtype) + # Test whether the new thresholds actually differ from the old + # ones + if np.any(new_thresholds != thresholds): + # Track the graph has been modified to inform the transform + # container to exhaustively repeat this transformation until + # no changes are possible graph_modified = True - return (model, graph_modified) + # Immediately exit here to propagate the data type changes + # before considering the next node + break + # Some data types might have changed, do one pass of data type inference + # to propagate these changes through the graph + model = model.transform(InferDataTypes()) + # Return the transformed model and indicate whether the graph actually + # has been transformed to exhaustively apply this transformation again. + return model, graph_modified From 3109645cb2a2bb764bd982948a36e2788756efc1 Mon Sep 17 00:00:00 2001 From: Christoph Berganski Date: Sat, 6 Apr 2024 17:10:36 +0200 Subject: [PATCH 03/51] [Tests] Rework test-cases for reworked RoundAndClipThresholds See https://github.com/Xilinx/finn/issues/978 --- .../streamline/test_round_thresholds.py | 257 ++++++++++++++++-- 1 file changed, 227 insertions(+), 30 deletions(-) diff --git a/tests/transformation/streamline/test_round_thresholds.py b/tests/transformation/streamline/test_round_thresholds.py index 85c60b37d5..63375598a0 100644 --- a/tests/transformation/streamline/test_round_thresholds.py +++ b/tests/transformation/streamline/test_round_thresholds.py @@ -26,45 +26,242 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# fmt: off +# Disable formatter. This is deliberately formatted to stay within 80 characters +# per line. Black, however, formats some lines going beyond this. + +# Testing framework import pytest +# Use numpy for python execution / computing the ground truth expected values import numpy as np + +# Utility types and function for creating onnx nodes and graphs from onnx import TensorProto, helper + +# QONNX data types like INT25 from qonnx.core.datatype import DataType + +# QONNX wrapper of ONNX model graphs from qonnx.core.modelwrapper import ModelWrapper -from qonnx.util.basic import qonnx_make_model +# Generate random tensors of QONNX/FINN data types for testing +from qonnx.util.basic import gen_finn_dt_tensor + +# Execution of onnx graphs within FINN import finn.core.onnx_exec as oxe + +# The transformation to be tested from finn.transformation.streamline import RoundAndClipThresholds -@pytest.mark.streamline -def test_round_thresholds(): - v = helper.make_tensor_value_info("v", TensorProto.FLOAT, [1, 4]) - thresholds = helper.make_tensor_value_info("thresholds", TensorProto.FLOAT, [4, 1]) - out = helper.make_tensor_value_info("out", TensorProto.FLOAT, [1, 4]) - node_def = helper.make_node( - "MultiThreshold", ["v", "thresholds"], ["out"], domain="qonnx.custom_op.general" +# Tests the RoundAndClipThresholds transformation under various input, output +# data type combinations with purely integer inputs. Without proper rounding, +# this tests only the clipping, range and type-casting behavior of the +# transformation. 
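# For reference (a note added here, not part of the original test file):
# float32 represents every integer up to 2**24 exactly, so
#   DataType["INT25"] covers [-16777216, 16777215] and is still exact, while
#   DataType["INT26"] covers [-33554432, 33554431] and already contains
# integers such as 2**24 + 1 that float32 can only store approximately,
# which is the regime the parametrizations below probe from both sides.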
+@pytest.mark.parametrize("i_dtype", [
    # Explanation for selecting these test configurations:
    # 1. Below 24-bit thresholds we will not observe any interesting rounding
    # behavior, as all integers < 2^24 can be exactly represented in 32-bit
    # floating-point. Thus, we test thresholds at 25-bit signed integers and
    # generate test inputs slightly above and below this.
    # 2. We want to test out-of-range clipping of thresholds, in particular
    # clipping of the negative portion of signed thresholds. Thus, we only
    # generate signed thresholds, but test with signed and unsigned
    # inputs of smaller, larger and equal range.
    # 3. Testing proper floating-point thresholds requires a separate test-case
    "INT23", "UINT23", "INT24", "UINT24", "INT25", "UINT25", "INT26", "UINT26"
])
@pytest.mark.parametrize("o_dtype", [
    # Explanation for selecting these test configurations:
    # 1. Outputs of MultiThreshold are typically much smaller bit-width than the
    # inputs and thresholds.
    # 2. However, with randomly sampled thresholds from a rather large range due
    # to the selected input bit-widths (see above), we risk not adequately
    # covering the input range if we sample too few thresholds. The number of
    # thresholds sampled depends on the bit-width of the output, thus we use
    # rather high bit-width for testing.
    # 3. For a "real" model, the quantization procedure *should* take care of
    # adequately covering the true input range.
    "INT8", "UINT8"
])
@pytest.mark.parametrize("n_elems", [
    # Explanation for selecting these test configurations:
    # 1. Small edge cases and quickly running through tests: 1, 2, 3, 4
    # 2. Large test case 256, hopefully amplifying any rarely occurring errors
    1, 2, 3, 4, 256
])
def test_round_and_clip_thresholds_ints(i_dtype, o_dtype, n_elems):
    # Convert string representation of data type to onnx DataType
    i_dtype = DataType[i_dtype]
    t_dtype = DataType["INT25"]  # Note: Matches configuration above
    o_dtype = DataType[o_dtype]  # noqa: Duplicate model setup code
    # Create a dummy MultiThreshold operation to be tested
    node = helper.make_node(
        # Op-Type of the node
        "MultiThreshold",
        # MultiThreshold is implemented under the qonnx domain
        domain="qonnx.custom_op.general",
        # List the names of the input tensors
        inputs=["inp", "thresholds"],
        # List the names of the output tensors
        outputs=["out"],
        # The CustomOp needs to know the data type of the output to be produced
        out_dtype=str(o_dtype)
    )
    # Number of threshold values required to produce outputs of type o_dtype
    n_thresholds = o_dtype.get_num_possible_values() - 1
    # Create tensor value infos for all input/output tensors involved
    inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, [1, n_elems])
    out = helper.make_tensor_value_info("out", TensorProto.FLOAT, [1, n_elems])
    # Create a tensor value info for the thresholds parameter tensor
    # Note: Number of thresholds is determined by the output data type
    thresholds = helper.make_tensor_value_info(
        "thresholds", TensorProto.FLOAT, [n_elems, n_thresholds]
    )
    # Combine node and tensor value infos into an onnx graph
    graph = helper.make_graph([node], "thresholds", [inp, thresholds], [out])
    # Wrap the model graph in a ModelWrapper container
    model = ModelWrapper(helper.make_model(graph))
    # Sample random tensors of the configured input data type
    inp = gen_finn_dt_tensor(i_dtype, [1, n_elems])
    # Generate sorted thresholds for each of the input channels
    thresholds = 
np.sort(gen_finn_dt_tensor(t_dtype, [n_elems, n_thresholds]))
    # Set data type annotations for the input and thresholds tensor
    model.set_tensor_datatype("inp", i_dtype)  # noqa: Duplicate model execution
    model.set_tensor_datatype("thresholds", t_dtype)
    model.set_tensor_datatype("out", o_dtype)
    # Set the thresholds as initializer input to the model
    model.set_initializer("thresholds", thresholds)
    # Execute the model before running the RoundAndClipThresholds transformation
    out_expected = oxe.execute_onnx(model, {"inp": inp})["out"]
    # Before rounding the threshold data type must be as annotated
    assert model.get_tensor_datatype("thresholds") == t_dtype
    # Run the transformation to be tested
    model = model.transform(RoundAndClipThresholds())
    # After this transformation, the thresholds and output data type should be
    # inferred correctly
    assert model.get_tensor_datatype("thresholds") == i_dtype
    assert model.get_tensor_datatype("out") == o_dtype
    # After this transformation, the container type used to store the thresholds
    # values must be float32. No other type-cast or type promotion may happen.
    assert model.get_initializer("thresholds").dtype == np.float32
    # After rounding, all thresholds must be integers represented as float32
    assert all(
        x.is_integer() for x in model.get_initializer("thresholds").flatten()
    )
    # Execute the model after running the RoundAndClipThresholds transformation
    out_produced = oxe.execute_onnx(model, {"inp": inp})["out"]
    # Compare the results before and after: This is the pure integer test-case
    # and no actual rounding should happen, thus the rounded operation should
    # produce outputs exactly equal.
    assert np.all(out_produced == out_expected)


# Tests the RoundAndClipThresholds transformation under various input and output
# data type combinations with integer inputs but floating-point thresholds. In
# contrast to the test case above, this one exercises actual rounding.
@pytest.mark.parametrize("i_dtype", [
    # Explanation for selecting these test configurations:
    # 1. Below 24-bit thresholds we will not observe any interesting rounding
    # behavior, as all integers < 2^24 can be exactly represented in 32-bit
    # floating-point. Thus, we test thresholds at 25-bit signed integers and
    # generate test inputs slightly above and below this.
    # 2. We want to test out-of-range clipping of thresholds, in particular
    # clipping of the negative portion of signed thresholds. Thus, we only
    # generate signed thresholds, but test with signed and unsigned
    # inputs of smaller, larger and equal range.
    # 3. Testing proper floating-point thresholds requires a separate test-case
    "INT23", "UINT23", "INT24", "UINT24", "INT25", "UINT25", "INT26", "UINT26"
])
@pytest.mark.parametrize("o_dtype", [
    # Explanation for selecting these test configurations:
    # 1. Outputs of MultiThreshold are typically much smaller bit-width than the
    # inputs and thresholds.
    # 2. However, with randomly sampled thresholds from a rather large range due
    # to the selected input bit-widths (see above), we risk not adequately
    # covering the input range if we sample too few thresholds. The number of
    # thresholds sampled depends on the bit-width of the output, thus we use
    # rather high bit-width for testing.
    # 3. For a "real" model, the quantization procedure *should* take care of
    # adequately covering the true input range.
    "INT8", "UINT8"
])
@pytest.mark.parametrize("n_elems", [
    # Explanation for selecting these test configurations:
    # 1. 
Small edge cases and quickly running through tests: 1, 2, 3, 4 + # 2. Large test case 256, hopefully amplifying any rarely occurring errors + 1, 2, 3, 4, 256 +]) +def test_round_and_clip_thresholds_floats(i_dtype, o_dtype, n_elems): + # Convert string representation of data type to onnx DataType + i_dtype = DataType[i_dtype] + t_dtype = DataType["FLOAT32"] + o_dtype = DataType[o_dtype] # noqa: Duplicate model setup code + # Create a dummy MultiThreshold operation to be tested + node = helper.make_node( + # Op-Type of the node + "MultiThreshold", + # MultiThreshold is implemented under the qonnx domain + domain="qonnx.custom_op.general", + # List the names of the input tensors + inputs=["inp", "thresholds"], + # List the names of the output tensors + outputs=["out"], + # The CustomOp needs to know the data type of the output to be produced + out_dtype=str(o_dtype) + ) + # Number of threshold values required to produce outputs of type o_dtype + n_thresholds = o_dtype.get_num_possible_values() - 1 + # Create tensor value infos for all input/output tensors involved + inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, [1, n_elems]) + out = helper.make_tensor_value_info("out", TensorProto.FLOAT, [1, n_elems]) + # Create a tensor value info for the thresholds parameter tensor + # Note: Number of thresholds is determined by the output data type + thresholds = helper.make_tensor_value_info( + "thresholds", TensorProto.FLOAT, [n_elems, n_thresholds] + ) + # Combine node and tensor value infos into an onnx graph + graph = helper.make_graph([node], "thresholds", [inp, thresholds], [out]) + # Wrap the model graph in a ModelWrapper container + model = ModelWrapper(helper.make_model(graph)) + # Sample random tensors of the configured input data type + inp = gen_finn_dt_tensor(i_dtype, [1, n_elems]) + # Draw uniformly random prototype thresholds in [0,+1] range + thresholds = np.random.rand(n_elems, n_thresholds) + # Type alias to 25-bit signed integer type used to set the range of the + # thresholds + INT25 = DataType["INT25"] # noqa: Variable name not lowercase + # Map the prototype thresholds into the test integer range and sort + thresholds = np.sort((INT25.max() - INT25.min()) * thresholds + INT25.min()) + # Set data type annotations for the input and thresholds tensor + model.set_tensor_datatype("inp", i_dtype) # noqa: Duplicate model execution + model.set_tensor_datatype("thresholds", t_dtype) + model.set_tensor_datatype("out", o_dtype) + # Set the thresholds as initializer input to the model + model.set_initializer("thresholds", thresholds) + # Execute the model before running the RoundAndClipThresholds transformation + out_expected = oxe.execute_onnx(model, {"inp": inp})["out"] + # Before rounding the threshold data type must be as annotated + assert model.get_tensor_datatype("thresholds") == t_dtype + # Run the transformation to be tested + model = model.transform(RoundAndClipThresholds()) + # After this transformation, the thresholds and output data type should be + # inferred correctly + assert model.get_tensor_datatype("thresholds") == i_dtype + assert model.get_tensor_datatype("out") == o_dtype + # After this transformation, the container type used to store the thresholds + # values must be float32. No other type-cast or type promotion may happen. 
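    # (A float64 dtype showing up at this point would be the tell-tale sign of
    # silent promotion by np.ceil/np.clip, which is the hypothetical failure
    # mode the following assertion guards against.)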
+    assert model.get_initializer("thresholds").dtype == np.float32
    # After rounding, all thresholds must be integers represented as float32
    assert all(
        x.is_integer() for x in model.get_initializer("thresholds").flatten()
    )
-    graph_def = helper.make_graph([node_def], "test_model", [v, thresholds], [out])
-    model_def = qonnx_make_model(graph_def)
-    model = ModelWrapper(model_def)
-    threshold_val = np.asarray([[-1.1], [0.7], [2.3], [5.1]], dtype=np.float32)
-    model.set_initializer("thresholds", threshold_val)
-    model.set_tensor_datatype("v", DataType["INT8"])
-    inp_dict_f = {"v": np.floor(threshold_val).T}
-    inp_dict_n = {"v": np.round(threshold_val).T}
-    inp_dict_c = {"v": np.ceil(threshold_val).T}
-    orig_f = oxe.execute_onnx(model, inp_dict_f)["out"]
-    orig_n = oxe.execute_onnx(model, inp_dict_n)["out"]
-    orig_c = oxe.execute_onnx(model, inp_dict_c)["out"]
-    assert model.get_tensor_datatype("thresholds") == DataType["FLOAT32"]
-    new_model = model.transform(RoundAndClipThresholds())
-    # rounded up thresholds should have same dtype as input
-    assert new_model.get_tensor_datatype("thresholds") == DataType["INT8"]
-    new_f = oxe.execute_onnx(new_model, inp_dict_f)["out"]
-    new_n = oxe.execute_onnx(new_model, inp_dict_n)["out"]
-    new_c = oxe.execute_onnx(new_model, inp_dict_c)["out"]
-    assert np.isclose(orig_f, new_f, atol=1e-3).all()
-    assert np.isclose(orig_n, new_n, atol=1e-3).all()
-    assert np.isclose(orig_c, new_c, atol=1e-3).all()
+    # Execute the model after running the RoundAndClipThresholds transformation
+    out_produced = oxe.execute_onnx(model, {"inp": inp})["out"]
+    # Compare the results before and after: This is the floating-point test with
+    # actual rounding, thus the transformed result may only be equal within some
+    # tolerance.
+    # Note: this has never been observed to be relevant in practice. For all test
+    # configurations, exact equality seems to hold, probably due to only integer
+    # inputs being tested.
+    assert np.allclose(out_produced, out_expected, atol=1.0e-3)

From 1b2665b7947cd4a1ded9459bdcf515de485fa518 Mon Sep 17 00:00:00 2001
From: lstasytis
Date: Wed, 17 Jul 2024 13:05:42 +0100
Subject: [PATCH 04/51] small typo fix

---
 notebooks/advanced/2_custom_op.ipynb                 | 2 +-
 notebooks/advanced/4_advanced_builder_settings.ipynb | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/notebooks/advanced/2_custom_op.ipynb b/notebooks/advanced/2_custom_op.ipynb
index bdd2976412..4c80c0263b 100644
--- a/notebooks/advanced/2_custom_op.ipynb
+++ b/notebooks/advanced/2_custom_op.ipynb
@@ -649,7 +649,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# run with FINN's execute_onnx, custom node will use c++ execution\n",
+    "# run with FINN's execute_onnx, custom node will use C++ execution\n",
     "new_op_inst.set_nodeattr(\"exec_mode\", \"c++\")\n",
     "ret = execute_onnx(mixedop_graph_new, inp_dict)\n",
     "ret"
    ]
diff --git a/notebooks/advanced/4_advanced_builder_settings.ipynb b/notebooks/advanced/4_advanced_builder_settings.ipynb
index 5139377342..e0c326d7d5 100644
--- a/notebooks/advanced/4_advanced_builder_settings.ipynb
+++ b/notebooks/advanced/4_advanced_builder_settings.ipynb
@@ -1278,7 +1278,7 @@
    "id": "f7012b9a",
    "metadata": {},
    "source": [
-    "In this section, we will have a peak into additional builder arguments the FINN compiler exposes. We will not be able to cover all but you will be able to have a look at a list and we encourage you to take your time to look into the different options there are to customize the FINN builder configuration."
+ "In this section, we will have a peek into additional builder arguments the FINN compiler exposes. We will not be able to cover all but you will be able to have a look at a list and we encourage you to take your time to look into the different options there are to customize the FINN builder configuration." ] }, { From 10fa30b3a47080310a749cbf7914fec3942b4e4e Mon Sep 17 00:00:00 2001 From: lstasytis Date: Wed, 17 Jul 2024 15:32:04 +0100 Subject: [PATCH 05/51] more typos and some rewording --- .../advanced/4_advanced_builder_settings.ipynb | 16 ++++++++-------- .../1-train-mlp-with-brevitas.ipynb | 2 +- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/notebooks/advanced/4_advanced_builder_settings.ipynb b/notebooks/advanced/4_advanced_builder_settings.ipynb index e0c326d7d5..4a0f2bc695 100644 --- a/notebooks/advanced/4_advanced_builder_settings.ipynb +++ b/notebooks/advanced/4_advanced_builder_settings.ipynb @@ -199,7 +199,7 @@ "id": "d746eff3", "metadata": {}, "source": [ - "After each FINN builder step, the graph is saved as .onnx file. In the cell above we sort the intermediate models by time in descending order (`ls -t -r`) to visualize the builder flow. As you can see after the conversion to the FINN-ONNX format (`step_qonnx_to_finn`), the graph is prepared by tidy up and streamlining (`step_tidy_up` and `step_streamline`) and then the high level nodes are converted to HW abstraction layers (`step_convert_to_hw`). Then there is a partition created from all layers that were converted to HW layers (`step_create_dataflow_partition`), then we convert each of the HW abstraction layers into an HLS or RTL variant (`step_specialize_layers`). Afterwards optimizations are applied (`step_target_fps_parallelization`, `step_apply_folding_config` and `step_minimize_bit_width`). In the final step of this example we generate resource and performance reports for the network (`step_generate_estimate_reports`). Use the code below to investigate the network after each step." + "After each FINN builder step, the graph is saved as an .onnx file. In the cell above we sort the intermediate models by time in descending order (`ls -t -r`) to visualize the builder flow. As you can see after the conversion to the FINN-ONNX format (`step_qonnx_to_finn`), the graph is prepared by tidy up and streamlining (`step_tidy_up` and `step_streamline`) and then the high level nodes are converted to HW abstraction layers (`step_convert_to_hw`). Then there is a partition created from all layers that were converted to HW layers (`step_create_dataflow_partition`), then we convert each of the HW abstraction layers into an HLS or RTL variant (`step_specialize_layers`). Afterwards optimizations are applied (`step_target_fps_parallelization`, `step_apply_folding_config` and `step_minimize_bit_width`). In the final step of this example we generate resource and performance reports for the network (`step_generate_estimate_reports`). Use the code below to investigate the network after each step." ] }, { @@ -218,7 +218,7 @@ "id": "bccebd0d", "metadata": {}, "source": [ - "The analysis of these .onnx files can help us identifying points in the flow in which we might need to intervene and provide the compiler with additional information. When investigating the network after the conversion to HW layers, we can see that there are layers that were not converted. We can see this by clicking on the different nodes. HW layers have the module `finn.custom_op.fpgadataflow`." 
+ "The analysis of these .onnx files can help us identify points in the flow in which we might need to intervene and provide the compiler with additional information. When investigating the network after the conversion to HW layers, we can see that there are layers that were not converted. We can see this by clicking on the different nodes. HW layers have the module `finn.custom_op.fpgadataflow`." ] }, { @@ -361,7 +361,7 @@ "id": "2809f6a7", "metadata": {}, "source": [ - "Each steps gets the model (`model: ModelWrapper`) and the build configuration (`cfg: DataflowBuildConfig`) as input arguments. Then a certain sequence of transformations is applied to the model. In some of the steps, verification can be run to ensure that the applied transformations have not changed the behaviour of the network. In the end the modified model is returned." + "Each steps gets the model (`model: ModelWrapper`) and the build configuration (`cfg: DataflowBuildConfig`) as input arguments. Then a certain sequence of transformations is applied to the model. In some of the steps, verification can be run to ensure that the applied transformations have not changed the behaviour of the network. In the end, the modified model is returned." ] }, { @@ -993,7 +993,7 @@ "id": "fd1519fe", "metadata": {}, "source": [ - "In the following part of the tutorial, we will use the auto generated json file as starting point to create two new json files which explore the `ram_style` attribute. We will use one of the generated reports from the FINN builder to see the impact of these changes.\n", + "In the following part of the tutorial, we will use the auto generated json file as a starting point to create two new json files which explore the `ram_style` attribute. We will use one of the generated reports from the FINN builder to see the impact of these changes.\n", "For that, we will extract the total resources from the *estimate_layer_resources.json* report in the following cell." ] }, @@ -1254,7 +1254,7 @@ "id": "97f87780", "metadata": {}, "source": [ - "The initial implementation already had a high utilization of BRAM, but the estimations went now up to ~500 BRAMs while the LUT count went down to ~99k." + "The initial implementation already had a high utilization of BRAM, but the estimations now went up to ~500 BRAMs while the LUT count went down to ~99k." ] }, { @@ -1302,7 +1302,7 @@ "id": "308d52ba", "metadata": {}, "source": [ - "Earlier in the tutorial, we had a look at how build steps are written. When investigating the `step_tidy_up`, we can see that before the changed model is returned a verification step can be run. In the case of `step_tidy_up` it is the step `\"initial python\"` that can be initiated by setting `VerificationStepType.TIDY_UP_PYTHON`." + "Earlier in the tutorial, we had a look at how build steps are written. When investigating the `step_tidy_up`, we can see that before the changed model is returned, a verification step can be run. In the case of `step_tidy_up` it is the step `\"initial python\"` that can be initiated by setting `VerificationStepType.TIDY_UP_PYTHON`." ] }, { @@ -1536,7 +1536,7 @@ "source": [ "There are attributes that come from the dataclasses-json class: `to_dict`, `to_json`, `schema`, `from_json`, `from_dict`. This class is used for the implementation of the FINN builder. In this tutorial, we are mainly interested in the FINN specific arguments. \n", "\n", - "Some of these arguments we have seen already in the Cybersecurity notebook and in this notebook, e.g. 
`target_fps`, `fpga_part` and `folding_config_file`. In the code of the FINN builder, the function of each builder argument is documents, you can have a look [here](https://github.com/Xilinx/finn/blob/dev/src/finn/builder/build_dataflow_config.py#L155) and scroll through the available builder arguments." + "Some of these arguments we have seen already in the Cybersecurity notebook and in this notebook, e.g. `target_fps`, `fpga_part` and `folding_config_file`. In the code of the FINN builder, the function of each builder argument is documented, you can have a look [here](https://github.com/Xilinx/finn/blob/dev/src/finn/builder/build_dataflow_config.py#L155) and scroll through the available builder arguments." ] }, { @@ -1602,7 +1602,7 @@ "id": "c249f141", "metadata": {}, "source": [ - "This concludes the advanced builder settings tutorial. Below you can find code that can help you investigating more of the builder arguments and invoking the whole flow to generate a bitfile." + "This concludes the advanced builder settings tutorial. Below you can find code that can help you in investigating more of the builder arguments and invoking the whole flow to generate a bitfile." ] }, { diff --git a/notebooks/end2end_example/cybersecurity/1-train-mlp-with-brevitas.ipynb b/notebooks/end2end_example/cybersecurity/1-train-mlp-with-brevitas.ipynb index da037050bb..e2bece5777 100644 --- a/notebooks/end2end_example/cybersecurity/1-train-mlp-with-brevitas.ipynb +++ b/notebooks/end2end_example/cybersecurity/1-train-mlp-with-brevitas.ipynb @@ -194,7 +194,7 @@ "source": [ "# Define a PyTorch Device \n", "\n", - "GPUs can significantly speed-up training of deep neural networks. We check for availability of a GPU and if so define it as target device." + "GPUs can significantly speed-up training of deep neural networks. We check for availability of a GPU and if so define it as the target device." ] }, { From f723c0cf34a06239ee12736b815c8c4e01d45c00 Mon Sep 17 00:00:00 2001 From: lstasytis Date: Tue, 30 Jul 2024 09:52:14 +0100 Subject: [PATCH 06/51] updating cybersecurity example, first pass --- .../1-train-mlp-with-brevitas.ipynb | 4 +- .../2-import-into-finn-and-verify.ipynb | 97 ++++++++++++++++++- 2 files changed, 93 insertions(+), 8 deletions(-) diff --git a/notebooks/end2end_example/cybersecurity/1-train-mlp-with-brevitas.ipynb b/notebooks/end2end_example/cybersecurity/1-train-mlp-with-brevitas.ipynb index e2bece5777..3f7a9b1070 100644 --- a/notebooks/end2end_example/cybersecurity/1-train-mlp-with-brevitas.ipynb +++ b/notebooks/end2end_example/cybersecurity/1-train-mlp-with-brevitas.ipynb @@ -709,9 +709,7 @@ "\n", "# ModelWrapper\n", "model = ModelWrapper(ready_model_filename)\n", - "# Setting the input datatype explicitly because it doesn't get derived from the export function\n", - "model.set_tensor_datatype(model.graph.input[0].name, DataType[\"BIPOLAR\"])\n", - "model = model.transform(ConvertQONNXtoFINN())\n", + "\n", "model.save(ready_model_filename)\n", "\n", "print(\"Model saved to %s\" % ready_model_filename)" diff --git a/notebooks/end2end_example/cybersecurity/2-import-into-finn-and-verify.ipynb b/notebooks/end2end_example/cybersecurity/2-import-into-finn-and-verify.ipynb index 33b64e11c0..d2fda2e830 100644 --- a/notebooks/end2end_example/cybersecurity/2-import-into-finn-and-verify.ipynb +++ b/notebooks/end2end_example/cybersecurity/2-import-into-finn-and-verify.ipynb @@ -51,9 +51,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# 1. 
Import model into FINN with ModelWrapper \n", + "# 1. Import model into FINN with ModelWrapper and ConvertQONNXtoFINN \n", "\n", - "Now that we have the model in .onnx format, we can work with it using FINN. To import it into FINN, we'll use the [`ModelWrapper`](https://finn.readthedocs.io/en/latest/source_code/finn.core.html#qonnx.core.modelwrapper.ModelWrapper). It is a wrapper around the ONNX model which provides several helper functions to make it easier to work with the model." + "\n", + "## 1.1 Using ModelWrapper to load and observe a model\n", + "We first load the model which we prepared in the last notebook by using the\n", + "[`ModelWrapper`](https://finn.readthedocs.io/en/latest/source_code/finn.core.html#qonnx.core.modelwrapper.ModelWrapper). It is a wrapper around the ONNX model which provides several helper functions to make it easier to work with the model." ] }, { @@ -64,17 +67,23 @@ "source": [ "import os\n", "from qonnx.core.modelwrapper import ModelWrapper\n", + "from qonnx.core.datatype import DataType\n", + "from finn.transformation.qonnx.convert_qonnx_to_finn import ConvertQONNXtoFINN\n", "\n", "model_dir = os.environ['FINN_ROOT'] + \"/notebooks/end2end_example/cybersecurity\"\n", "ready_model_filename = model_dir + \"/cybsec-mlp-ready.onnx\"\n", - "model_for_sim = ModelWrapper(ready_model_filename)" + "\n", + "# ModelWrapper\n", + "model_for_sim = ModelWrapper(ready_model_filename)\n", + "\n", + "print(\"Model loaded from %s\" % ready_model_filename)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Let's have a look at some of the member functions exposed by `ModelWrapper` to see what kind of information we can extract from it." + "Now that we have the model in .onnx format, we can look at some of the operations that were introduced, however we cannot use it in FINN just yet. To import it into FINN, we will need to use the ConvertQONNXtoFINN transformation. But before that, let us use some of the member functions exposed by `ModelWrapper` to see what kind of information we can extract from it and have a baseline to compare to when we do call the ConvertQONNXtoFINN transformation." ] }, { @@ -121,7 +130,85 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Note that the output tensor is (as of yet) marked as a float32 value, even though we know the output is binary. This will be automatically inferred by the compiler in the next step when we run the `InferDataTypes` transformation." + "Note that the output tensor is (as of yet) marked as a float32 value, even though we know the output is binary. This will get resolved when we call the `ConvertQONNXtoFINN` transformation, which internally features an `Infer_Data_Types` transformation." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1.2 Converting to from QONNX to FINN using ConvertQONNXtoFINN\n", + "\n", + "At this point, we would like to move from the QONNX intermediate representation (IR) onto the FINN IR. We can do this by using the ConvertQONNXtoFINN() function from FINN on a QONNX model." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "# Setting the input datatype explicitly because it doesn't get derived from the export function\n", + "model_for_sim.set_tensor_datatype(model_for_sim.graph.input[0].name, DataType[\"BIPOLAR\"])\n", + "\n", + "# Calling the actual QONNX -> FINN transformation\n", + "model_for_sim = model_for_sim.transform(ConvertQONNXtoFINN())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can look at the tensor datatypes and operator types again to see how they have changed." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from qonnx.core.datatype import DataType\n", + "\n", + "finnonnx_in_tensor_name = model_for_sim.graph.input[0].name\n", + "finnonnx_out_tensor_name = model_for_sim.graph.output[0].name\n", + "print(\"Input tensor name: %s\" % finnonnx_in_tensor_name)\n", + "print(\"Output tensor name: %s\" % finnonnx_out_tensor_name)\n", + "finnonnx_model_in_shape = model_for_sim.get_tensor_shape(finnonnx_in_tensor_name)\n", + "finnonnx_model_out_shape = model_for_sim.get_tensor_shape(finnonnx_out_tensor_name)\n", + "print(\"Input tensor shape: %s\" % str(finnonnx_model_in_shape))\n", + "print(\"Output tensor shape: %s\" % str(finnonnx_model_out_shape))\n", + "finnonnx_model_in_dt = model_for_sim.get_tensor_datatype(finnonnx_in_tensor_name)\n", + "finnonnx_model_out_dt = model_for_sim.get_tensor_datatype(finnonnx_out_tensor_name)\n", + "print(\"Input tensor datatype: %s\" % str(finnonnx_model_in_dt.name))\n", + "print(\"Output tensor datatype: %s\" % str(finnonnx_model_out_dt.name))\n", + "print(\"List of node operator types in the graph: \")\n", + "print([x.op_type for x in model_for_sim.graph.node])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Notice that the input and output tensor datatypes now correctly show `BIPOLAR` while the operator types have also heavily changed compared to the QONNX version. This is because in FINN, we use operators more suitable for FPGA implementations. `ConvertQONNXtoFINN` internally called many transformations which change the operators in such a manner and we can actually peek at the source code to see them using the `showSrc` function." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from finn.util.visualization import showSrc\n", + "showSrc(ConvertQONNXtoFINN.apply)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As we can see, `ConvertQONNXtoFINN` turned GEMM operation into MatMuls, turned ReLU nodes into Thresholding nodes and so forth. However, these nodes do need further transformations before they can be turned into FPGA operators, which we handle in the next step." 
] }, { From e19f8b6715cc9871c465c4f8b098a1cdf55bf272 Mon Sep 17 00:00:00 2001 From: lstasytis Date: Tue, 6 Aug 2024 16:20:10 +0100 Subject: [PATCH 07/51] ONNX to QONNX and similar changes, updated text --- .../1-train-mlp-with-brevitas.ipynb | 28 +++-- .../2-import-into-finn-and-verify.ipynb | 111 ++++++++++-------- 2 files changed, 74 insertions(+), 65 deletions(-) diff --git a/notebooks/end2end_example/cybersecurity/1-train-mlp-with-brevitas.ipynb b/notebooks/end2end_example/cybersecurity/1-train-mlp-with-brevitas.ipynb index 3f7a9b1070..73bb009e2d 100644 --- a/notebooks/end2end_example/cybersecurity/1-train-mlp-with-brevitas.ipynb +++ b/notebooks/end2end_example/cybersecurity/1-train-mlp-with-brevitas.ipynb @@ -672,7 +672,7 @@ "\n", "[ONNX](https://onnx.ai/) is an open format built to represent machine learning models, and the FINN compiler expects an ONNX model as input. We'll now export our network into ONNX to be imported and used in FINN for the next notebooks. Note that the particular ONNX representation used for FINN differs from standard ONNX, you can read more about this [here](https://finn.readthedocs.io/en/latest/internals.html#intermediate-representation-finn-onnx).\n", "\n", - "You can see below how we export a trained network in Brevitas into a FINN-compatible ONNX representation (QONNX). QONNX is the format we can export from Brevitas, to feed it into the FINN compiler, we will need to make a conversion to the FINN-ONNX format which is the intermediate representation the compiler works on. The conversion of the FINN-ONNX format is a FINN compiler transformation and to be able to apply it to our model, we will need to wrap it into [ModelWrapper](https://finn.readthedocs.io/en/latest/internals.html#modelwrapper). This is a wrapper around the ONNX model which provides several helper functions to make it easier to work with the model. Then we can call the conversion function to obtain the model in FINN-ONNX format." + "You can see below how we export a trained network in Brevitas into a FINN-compatible ONNX representation (QONNX). QONNX is the format we can export from Brevitas, to feed it into the FINN compiler, we will need to make a conversion to the FINN-ONNX format which is the intermediate representation the compiler works on. The conversion of the FINN-ONNX format is a FINN compiler transformation and to be able to apply it to our model, we will need to wrap it into [ModelWrapper](https://finn.readthedocs.io/en/latest/internals.html#modelwrapper). This is a wrapper around the ONNX model which provides several helper functions to make it easier to work with the model. Then we can call the conversion function to obtain the model in FINN-ONNX format. This will be done in the next notebook. For now, we simply export and save the QONNX model." ] }, { @@ -707,11 +707,6 @@ "# clean-up\n", "qonnx_cleanup(ready_model_filename, out_file=ready_model_filename)\n", "\n", - "# ModelWrapper\n", - "model = ModelWrapper(ready_model_filename)\n", - "\n", - "model.save(ready_model_filename)\n", - "\n", "print(\"Model saved to %s\" % ready_model_filename)" ] }, @@ -719,16 +714,16 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## View the Exported ONNX in Netron\n", + "## View the Exported QONNX in Netron\n", "\n", - "Let's examine the exported ONNX model with [Netron](https://github.com/lutzroeder/netron), which is a visualizer for neural networks and allows interactive investigation of network properties. 
For example, you can click on the individual nodes and view the properties. Particular things of note:\n", + "Let's examine the exported QONNX model with [Netron](https://github.com/lutzroeder/netron), which is a visualizer for neural networks and allows interactive investigation of network properties. For example, you can click on the individual nodes and view the properties. Particular things of note:\n", "\n", - "* The input tensor \"0\" is annotated with `quantization: finn_datatype: BIPOLAR`\n", "* The input preprocessing (x + 1) / 2 is exported as part of the network (initial `Add` and `Div` layers)\n", - "* Brevitas `QuantLinear` layers are exported to ONNX as `MatMul`. We've exported the padded version; shape of the first MatMul node's weight parameter is 600x64\n", - "* The weight parameters (second inputs) for MatMul nodes are annotated with `quantization: finn_datatype: INT2`\n", - "* The quantized activations are exported as `MultiThreshold` nodes with `domain=qonnx.custom_op.general`\n", - "* There's a final `MultiThreshold` node with threshold=0 to produce the final bipolar output (this is the `qnt_output` from `CybSecMLPForExport`" + "* Brevitas `QuantLinear` layers are exported to QONNX as `Gemm`. We've exported the padded version; shape of the first `Gemm` node's weight parameter is 600x64\n", + "* The quantized activations are exported as `Quant` nodes with `domain=qonnx.custom_op.general`\n", + "* The weight parameters (second inputs) for the `Gemm` node can also be viewed by opening up the producer `Quant` node, scrolling down to the `Inputs` section and pressing the plus sign to the right of the first input parameter. For the first `Quant` node, this would be the parameter named `Quant_0_param0`\n", + "* The bitwidth of the weights are also shown as the 4th value in the `Quant` node, (3=2) meaning that we quantize to 2 bits total.\n", + "* There's a final `BipolarQuant` node with a single input and output value to produce the final bipolar output (this is the `qnt_output` from `CybSecMLPForExport`)" ] }, { @@ -749,6 +744,13 @@ "## That's it! \n", "You created, trained and tested a quantized MLP that is ready to be loaded into FINN, congratulations! You can now proceed to the next notebook." ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { diff --git a/notebooks/end2end_example/cybersecurity/2-import-into-finn-and-verify.ipynb b/notebooks/end2end_example/cybersecurity/2-import-into-finn-and-verify.ipynb index d2fda2e830..522a25f5c7 100644 --- a/notebooks/end2end_example/cybersecurity/2-import-into-finn-and-verify.ipynb +++ b/notebooks/end2end_example/cybersecurity/2-import-into-finn-and-verify.ipynb @@ -12,7 +12,7 @@ "\n", "**Also remember to 'close and halt' any other FINN notebooks, since Netron visualizations use the same port.**\n", "\n", - "In this notebook we will show how to import the network we trained in Brevitas and verify it in the FINN compiler. \n", + "In this notebook we will show how to import the network we trained in Brevitas, convert it from the QONNX format to FINN-ONNX, going over the differences and, lastly, verify it in the FINN compiler. 
\n", "This verification process can actually be done at various stages in the compiler [as explained in this notebook](../bnn-pynq/tfc_end2end_verification.ipynb) but for this example we'll only consider the first step: verifying the exported high-level FINN-ONNX model.\n", "Another goal of this notebook is to introduce you to the concept of *graph transformations* -- we'll be applying some transformations to the graph to make it executable for verification. \n", "Once this model is sucessfully verified, we'll generate an FPGA accelerator from it in the next notebook." @@ -41,7 +41,7 @@ "source": [ "## Outline\n", "-------------\n", - "1. [Import model into FINN with ModelWrapper](#brevitas_import_visualization)\n", + "1. [Convert model from QONNX to FINN-ONNX](#brevitas_import_visualization)\n", "2. [Network preparations: Tidy-up transformations](#network_preparations)\n", "3. [Load the dataset and Brevitas model](#load_dataset) \n", "4. [Compare FINN and Brevitas execution](#compare_brevitas)" @@ -51,7 +51,14 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# 1. Import model into FINN with ModelWrapper and ConvertQONNXtoFINN \n", + "# 1. Convert model from QONNX to FINN-ONNX \n", + "\n", + "\n", + "To answer the question of why we need a conversion from QONNX to FINN-ONNX in the first place, it is important to note key differences between the three representations: ONNX, QONNX and FINN-ONNX.\n", + "\n", + "Currently, ONNX provides only limited support for quantizing data types, while QONNX and FINN-ONNX provide fully flexible quantization support. However the way in which they do differs: QONNX provides special node types called `Quant` which ingest weights or previous node output streams as inputs to produce quantized output streams. However, this node is not designed with dataflow architectures in mind, with each node instance only performing the quantization on one individual input stream. Meanwhile, FINN-ONNX has a special node type called `Thresholding`, which was designed with dataflow graph models in mind. Beyond, this, there are other node types which differ in FINN-ONNX as opposed to QONNX. Thus we need a conversion function, which we will explore in more detail shortly.\n", + "\n", + "Lastly, we want to emphasize that we use the uppercase naming (ONNX, QONNX, FINN-ONNX) for the intermediate representations (IR), while the lower case naming (onnx, qonnx, finn) are used to refer to the compiler toolchains themselves.\n", "\n", "\n", "## 1.1 Using ModelWrapper to load and observe a model\n", @@ -74,7 +81,7 @@ "ready_model_filename = model_dir + \"/cybsec-mlp-ready.onnx\"\n", "\n", "# ModelWrapper\n", - "model_for_sim = ModelWrapper(ready_model_filename)\n", + "model = ModelWrapper(ready_model_filename)\n", "\n", "print(\"Model loaded from %s\" % ready_model_filename)" ] @@ -83,7 +90,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Now that we have the model in .onnx format, we can look at some of the operations that were introduced, however we cannot use it in FINN just yet. To import it into FINN, we will need to use the ConvertQONNXtoFINN transformation. But before that, let us use some of the member functions exposed by `ModelWrapper` to see what kind of information we can extract from it and have a baseline to compare to when we do call the ConvertQONNXtoFINN transformation." + "To import the model into FINN, we will need to use the `ConvertQONNXtoFINN` transformation. 
But before that, let us use some of the member functions exposed by `ModelWrapper` to see what kind of information we can extract from it and have a baseline to compare to when we do call the `ConvertQONNXtoFINN` transformation." ] }, { @@ -92,14 +99,14 @@ "metadata": {}, "outputs": [], "source": [ - "dir(model_for_sim)" + "dir(model)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Many of these helper functions relate to extracting information about the structure and properties of the ONNX model. You can find out more about examining and manipulating ONNX models programmatically in [this tutorial](../../basics/0_how_to_work_with_onnx.ipynb), but we'll show a few basic functions here. For instance, we can extract the shape and datatype annotation for various tensors in the graph, as well as information related to the operation types associated with each node." + "Many of these helper functions relate to extracting information about the structure and properties of the ONNX model. You can find out more about examining and manipulating ONNX models programmatically in [this tutorial](../../basics/0_how_to_work_with_onnx.ipynb), but we'll show a few basic functions here. For instance, we can extract the shape and datatype annotation for various tensors in the graph, as well as information related to the operation types associated with each node. We will do this now." ] }, { @@ -110,27 +117,27 @@ "source": [ "from qonnx.core.datatype import DataType\n", "\n", - "finnonnx_in_tensor_name = model_for_sim.graph.input[0].name\n", - "finnonnx_out_tensor_name = model_for_sim.graph.output[0].name\n", - "print(\"Input tensor name: %s\" % finnonnx_in_tensor_name)\n", - "print(\"Output tensor name: %s\" % finnonnx_out_tensor_name)\n", - "finnonnx_model_in_shape = model_for_sim.get_tensor_shape(finnonnx_in_tensor_name)\n", - "finnonnx_model_out_shape = model_for_sim.get_tensor_shape(finnonnx_out_tensor_name)\n", - "print(\"Input tensor shape: %s\" % str(finnonnx_model_in_shape))\n", - "print(\"Output tensor shape: %s\" % str(finnonnx_model_out_shape))\n", - "finnonnx_model_in_dt = model_for_sim.get_tensor_datatype(finnonnx_in_tensor_name)\n", - "finnonnx_model_out_dt = model_for_sim.get_tensor_datatype(finnonnx_out_tensor_name)\n", - "print(\"Input tensor datatype: %s\" % str(finnonnx_model_in_dt.name))\n", - "print(\"Output tensor datatype: %s\" % str(finnonnx_model_out_dt.name))\n", + "in_tensor_name = model.graph.input[0].name\n", + "out_tensor_name = model.graph.output[0].name\n", + "print(\"Input tensor name: %s\" % in_tensor_name)\n", + "print(\"Output tensor name: %s\" % out_tensor_name)\n", + "model_in_shape = model.get_tensor_shape(in_tensor_name)\n", + "model_out_shape = model.get_tensor_shape(out_tensor_name)\n", + "print(\"Input tensor shape: %s\" % str(model_in_shape))\n", + "print(\"Output tensor shape: %s\" % str(model_out_shape))\n", + "model_in_dt = model.get_tensor_datatype(in_tensor_name)\n", + "model_out_dt = model.get_tensor_datatype(out_tensor_name)\n", + "print(\"Input tensor datatype: %s\" % str(model_in_dt.name))\n", + "print(\"Output tensor datatype: %s\" % str(model_out_dt.name))\n", "print(\"List of node operator types in the graph: \")\n", - "print([x.op_type for x in model_for_sim.graph.node])" + "print([x.op_type for x in model.graph.node])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Note that the output tensor is (as of yet) marked as a float32 value, even though we know the output is binary. 
This will get resolved when we call the `ConvertQONNXtoFINN` transformation, which internally features an `Infer_Data_Types` transformation."
+    "Note that the input and output tensors are (as of yet) marked as float32 values, even though we know they are binary. The output datatype will get inferred when we call the `ConvertQONNXtoFINN` transformation, which internally features an `InferDataTypes` transformation, while the input we will adjust manually with the `set_tensor_datatype` function."
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
     "## 1.2 Converting from QONNX to FINN using ConvertQONNXtoFINN\n",
     "\n",
-    "At this point, we would like to move from the QONNX intermediate representation (IR) onto the FINN IR. We can do this by using the ConvertQONNXtoFINN() function from FINN on a QONNX model."
+    "At this point, we would like to move from the QONNX IR onto the FINN-ONNX IR. We can do this by using the `ConvertQONNXtoFINN()` function on a QONNX model."
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
     "\n",
     "# Setting the input datatype explicitly because it doesn't get derived from the export function\n",
     "model.set_tensor_datatype(model.graph.input[0].name, DataType[\"BIPOLAR\"])\n",
     "\n",
     "# Calling the actual QONNX -> FINN transformation\n",
     "model = model.transform(ConvertQONNXtoFINN())"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
     "We can look at the tensor datatypes and operator types again to see how they have changed."
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
     "from qonnx.core.datatype import DataType\n",
     "\n",
     "in_tensor_name = model.graph.input[0].name\n",
     "out_tensor_name = model.graph.output[0].name\n",
     "print(\"Input tensor name: %s\" % in_tensor_name)\n",
     "print(\"Output tensor name: %s\" % out_tensor_name)\n",
     "model_in_shape = model.get_tensor_shape(in_tensor_name)\n",
     "model_out_shape = model.get_tensor_shape(out_tensor_name)\n",
     "print(\"Input tensor shape: %s\" % str(model_in_shape))\n",
     "print(\"Output tensor shape: %s\" % str(model_out_shape))\n",
     "model_in_dt = model.get_tensor_datatype(in_tensor_name)\n",
     "model_out_dt = model.get_tensor_datatype(out_tensor_name)\n",
     "print(\"Input tensor datatype: %s\" % str(model_in_dt.name))\n",
     "print(\"Output tensor datatype: %s\" % str(model_out_dt.name))\n",
     "print(\"List of node operator types in the graph: \")\n",
     "print([x.op_type for x in model.graph.node])"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "As we can see, `ConvertQONNXtoFINN` turned GEMM operation into 
MatMuls, turned ReLU nodes into Thresholding nodes and so forth. However, these nodes do need further transformations before they can be turned into FPGA operators, which we handle in the next step." + "As we can see, `ConvertQONNXtoFINN` turned `Gemm` operation into `MatMuls` using the `GemmToMatMul()` transform and turned `Quant` nodes into `Thresholding` nodes using the `ConvertQuantActToMultiThreshold()` transform to name a few. However, these nodes do need further transformations before they can be turned into FPGA operators, which we handle in the next step." ] }, { @@ -233,15 +240,15 @@ "from qonnx.transformation.infer_datatypes import InferDataTypes\n", "from qonnx.transformation.fold_constants import FoldConstants\n", "\n", - "model_for_sim = model_for_sim.transform(InferShapes())\n", - "model_for_sim = model_for_sim.transform(FoldConstants())\n", - "model_for_sim = model_for_sim.transform(GiveUniqueNodeNames())\n", - "model_for_sim = model_for_sim.transform(GiveReadableTensorNames())\n", - "model_for_sim = model_for_sim.transform(InferDataTypes())\n", - "model_for_sim = model_for_sim.transform(RemoveStaticGraphInputs())\n", + "model = model.transform(InferShapes())\n", + "model = model.transform(FoldConstants())\n", + "model = model.transform(GiveUniqueNodeNames())\n", + "model = model.transform(GiveReadableTensorNames())\n", + "model = model.transform(InferDataTypes())\n", + "model = model.transform(RemoveStaticGraphInputs())\n", "\n", "verif_model_filename = model_dir + \"/cybsec-mlp-verification.onnx\"\n", - "model_for_sim.save(verif_model_filename)" + "model.save(verif_model_filename)" ] }, { @@ -396,22 +403,22 @@ "import finn.core.onnx_exec as oxe\n", "\n", "def inference_with_finn_onnx(current_inp):\n", - " finnonnx_in_tensor_name = model_for_sim.graph.input[0].name\n", - " finnonnx_model_in_shape = model_for_sim.get_tensor_shape(finnonnx_in_tensor_name)\n", - " finnonnx_out_tensor_name = model_for_sim.graph.output[0].name\n", + " in_tensor_name = model.graph.input[0].name\n", + " model_in_shape = model.get_tensor_shape(in_tensor_name)\n", + " out_tensor_name = model.graph.output[0].name\n", " # convert input to numpy for FINN\n", " current_inp = current_inp.detach().numpy()\n", " # add padding and re-scale to bipolar\n", " current_inp = np.pad(current_inp, [(0, 0), (0, 7)])\n", " current_inp = 2*current_inp-1\n", " # reshape to expected input (add 1 for batch dimension)\n", - " current_inp = current_inp.reshape(finnonnx_model_in_shape)\n", + " current_inp = current_inp.reshape(model_in_shape)\n", " # create the input dictionary\n", - " input_dict = {finnonnx_in_tensor_name : current_inp} \n", + " input_dict = {in_tensor_name : current_inp} \n", " # run with FINN's execute_onnx\n", - " output_dict = oxe.execute_onnx(model_for_sim, input_dict)\n", + " output_dict = oxe.execute_onnx(model, input_dict)\n", " #get the output tensor\n", - " finn_output = output_dict[finnonnx_out_tensor_name] \n", + " finn_output = output_dict[out_tensor_name] \n", " return finn_output" ] }, From 84cbc0cbbadf1633033a4079b623114cfc459fee Mon Sep 17 00:00:00 2001 From: Christoph Berganski Date: Wed, 7 Aug 2024 11:34:05 +0200 Subject: [PATCH 08/51] [ConvolutionInputGenerator] Make infer_node_datatype update attributes Without updating the datatype attributes of the node, there might be a mismatch between tensor annotations (the actual datatype) and the type assumed by the node. This becomes an issue for example when querying the bit-width of the stream when inserting data-width converters. 
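To make the failure mode concrete, the mismatch can be detected with a minimal standalone sketch along the following lines (illustrative only, not part of the diff below; `model` and `node` are assumed to come from a ModelWrapper graph containing a ConvolutionInputGenerator):

    from qonnx.core.datatype import DataType
    from qonnx.custom_op.registry import getCustomOp

    def swg_dtype_mismatch(model, node):
        # datatype recorded in the node attributes (potentially stale)
        inst = getCustomOp(node)
        attr_dt = DataType[inst.get_nodeattr("inputDataType")]
        # datatype currently annotated on the input tensor
        anno_dt = model.get_tensor_datatype(node.input[0])
        # data-width converter insertion sizes streams via the node, so a
        # disagreement here can produce wrongly sized converters
        return attr_dt.bitwidth() != anno_dt.bitwidth()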
--- .../fpgadataflow/convolutioninputgenerator.py | 22 +++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py index 96f49069c7..1fb4940fb4 100644 --- a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py +++ b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py @@ -27,6 +27,7 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import numpy as np +import warnings from onnx import TensorProto, helper from qonnx.core.datatype import DataType from qonnx.core.modelwrapper import ModelWrapper @@ -141,6 +142,27 @@ def infer_node_datatype(self, model): node = self.onnx_node # data type stays the same dtype = model.get_tensor_datatype(node.input[0]) + + # Test for changing input datatype + if dtype != self.get_nodeattr("inputDataType"): + # Issue a warning message + warnings.warn( + f"{node.name}: inputDataType changing from" + f" {self.get_nodeattr('inputDataType')} to {dtype}" + ) + # Set the new datatype attribute + self.set_nodeattr("inputDataType", dtype.name) + + # Test for changing output datatype + if dtype != self.get_nodeattr("outputDataType"): + # Issue a warning message + warnings.warn( + f"{node.name}: outputDataType changing from" + f" {self.get_nodeattr('outputDataType')} to {dtype}" + ) + # Set the new datatype attribute + self.set_nodeattr("outputDataType", dtype.name) + # Propagate the datatype through the model graph model.set_tensor_datatype(node.output[0], dtype) def verify_node(self): From a78f23b2d368cf94d32d5084f34f81affd44f516 Mon Sep 17 00:00:00 2001 From: Hannah Yan Date: Wed, 7 Aug 2024 13:59:28 +0100 Subject: [PATCH 09/51] Increased liveness threshold for verify_step_stitched_ip_rtlsim --- src/finn/builder/build_dataflow_steps.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/finn/builder/build_dataflow_steps.py b/src/finn/builder/build_dataflow_steps.py index b8ed8daec7..bdbcc53d83 100644 --- a/src/finn/builder/build_dataflow_steps.py +++ b/src/finn/builder/build_dataflow_steps.py @@ -666,7 +666,7 @@ def step_create_stitched_ip(model: ModelWrapper, cfg: DataflowBuildConfig): estimate_network_performance = verify_model.analysis(dataflow_performance) prev_liveness = pyverilate_get_liveness_threshold_cycles() os.environ["LIVENESS_THRESHOLD"] = str( - int(estimate_network_performance["critical_path_cycles"]) + int(estimate_network_performance["critical_path_cycles"] * 1.1) ) if cfg.verify_save_rtlsim_waveforms: report_dir = cfg.output_dir + "/report" From 6e4115347a1ca3bc20058b26381c8c08b3c46284 Mon Sep 17 00:00:00 2001 From: auphelia Date: Thu, 8 Aug 2024 09:39:19 +0100 Subject: [PATCH 10/51] [NBs] Update text for cybersecurity notebooks --- .../1-train-mlp-with-brevitas.ipynb | 13 +++-------- .../2-import-into-finn-and-verify.ipynb | 22 +++++++++++++------ 2 files changed, 18 insertions(+), 17 deletions(-) diff --git a/notebooks/end2end_example/cybersecurity/1-train-mlp-with-brevitas.ipynb b/notebooks/end2end_example/cybersecurity/1-train-mlp-with-brevitas.ipynb index 73bb009e2d..3f8d65497b 100644 --- a/notebooks/end2end_example/cybersecurity/1-train-mlp-with-brevitas.ipynb +++ b/notebooks/end2end_example/cybersecurity/1-train-mlp-with-brevitas.ipynb @@ -53,7 +53,7 @@ " * [(Option 1) Train the Model from Scratch](#train_scratch)\n", " * [(Option 2) Load Pre-Trained Parameters](#load_pretrained)\n", "* [Network Surgery Before Export](#network_surgery)\n", - "* [Export to QONNX 
and Conversion to FINN-ONNX](#export_qonnx)" + "* [Export to QONNX](#export_qonnx)" ] }, { @@ -667,12 +667,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Export to QONNX and Conversion to FINN-ONNX \n", + "# Export to QONNX \n", "\n", "\n", "[ONNX](https://onnx.ai/) is an open format built to represent machine learning models, and the FINN compiler expects an ONNX model as input. We'll now export our network into ONNX to be imported and used in FINN for the next notebooks. Note that the particular ONNX representation used for FINN differs from standard ONNX, you can read more about this [here](https://finn.readthedocs.io/en/latest/internals.html#intermediate-representation-finn-onnx).\n", "\n", - "You can see below how we export a trained network in Brevitas into a FINN-compatible ONNX representation (QONNX). QONNX is the format we can export from Brevitas, to feed it into the FINN compiler, we will need to make a conversion to the FINN-ONNX format which is the intermediate representation the compiler works on. The conversion of the FINN-ONNX format is a FINN compiler transformation and to be able to apply it to our model, we will need to wrap it into [ModelWrapper](https://finn.readthedocs.io/en/latest/internals.html#modelwrapper). This is a wrapper around the ONNX model which provides several helper functions to make it easier to work with the model. Then we can call the conversion function to obtain the model in FINN-ONNX format. This will be done in the next notebook. For now, we simply export and save the QONNX model." + "You can see below how we export a trained network in Brevitas into a FINN-compatible ONNX representation (QONNX). QONNX is the format we can export from Brevitas, to feed it into the FINN compiler, we will need to make a conversion to the FINN-ONNX format which is the intermediate representation the compiler works on. This will be done in the next notebook. For now, we simply export and save the QONNX model." ] }, { @@ -744,13 +744,6 @@ "## That's it! \n", "You created, trained and tested a quantized MLP that is ready to be loaded into FINN, congratulations! You can now proceed to the next notebook." ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { diff --git a/notebooks/end2end_example/cybersecurity/2-import-into-finn-and-verify.ipynb b/notebooks/end2end_example/cybersecurity/2-import-into-finn-and-verify.ipynb index 522a25f5c7..70f1acae0a 100644 --- a/notebooks/end2end_example/cybersecurity/2-import-into-finn-and-verify.ipynb +++ b/notebooks/end2end_example/cybersecurity/2-import-into-finn-and-verify.ipynb @@ -54,11 +54,13 @@ "# 1. Convert model from QONNX to FINN-ONNX \n", "\n", "\n", - "To answer the question of why we need a conversion from QONNX to FINN-ONNX in the first place, it is important to note key differences between the three representations: ONNX, QONNX and FINN-ONNX.\n", + "Even though the input to finn is the QONNX format, an IR called FINN-ONNX is used inside the compiler. In this part of the notebook, we show how to convert QONNX to FINN-ONNX and explain the key differences between the three representations: ONNX, QONNX and FINN-ONNX.\n", "\n", - "Currently, ONNX provides only limited support for quantizing data types, while QONNX and FINN-ONNX provide fully flexible quantization support. 
However the way in which they do differs: QONNX provides special node types called `Quant` which ingest weights or previous node output streams as inputs to produce quantized output streams. However, this node is not designed with dataflow architectures in mind, with each node instance only performing the quantization on one individual input stream. Meanwhile, FINN-ONNX has a special node type called `Thresholding`, which was designed with dataflow graph models in mind. Beyond, this, there are other node types which differ in FINN-ONNX as opposed to QONNX. Thus we need a conversion function, which we will explore in more detail shortly.\n",
+    "QONNX and FINN-ONNX are extensions to the standard ONNX format. Currently, ONNX provides only limited support for expressing quantization, while QONNX and FINN-ONNX provide fully flexible quantization support. However, the way in which they do so differs: QONNX provides special node types called `Quant` which ingest weights or previous node output streams as inputs to produce quantized output streams. Meanwhile, FINN-ONNX uses tensor annotation to express quantization and has a special node type called `MultiThreshold`, which implements quantization on the activation data path.\n",
     "\n",
+    "Beyond this, there are other node types which differ in FINN-ONNX as opposed to QONNX. Thus we need a conversion function, which we will explore in more detail shortly.\n",
     "\n",
     "Lastly, we want to emphasize that we use the uppercase naming (ONNX, QONNX, FINN-ONNX) for the intermediate representations (IR), while the lower case naming (onnx, qonnx, finn) are usually used to refer to the compiler toolkits themselves.\n",
     "\n",
     "\n",
     "## 1.1 Using ModelWrapper to load and observe a model\n",
@@ -144,7 +146,7 @@
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "## 1.2 Converting to from QONNX to FINN using ConvertQONNXtoFINN\n",
+    "## 1.2 Converting from QONNX to FINN-ONNX using ConvertQONNXtoFINN\n",
     "\n",
     "At this point, we would like to move from the QONNX IR onto the FINN-ONNX IR. We can do this by using the `ConvertQONNXtoFINN()` function on a QONNX model."
   ]
  },
  {
@@ -155,7 +157,6 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "\n",
     "# Setting the input datatype explicitly because it doesn't get derived from the export function\n",
     "model.set_tensor_datatype(model.graph.input[0].name, DataType[\"BIPOLAR\"])\n",
     "\n",
     "# Calling the actual QONNX -> FINN transformation\n",
     "model = model.transform(ConvertQONNXtoFINN())"
   ]
  },
  {
@@ -198,7 +199,14 @@
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "Notice that the input and output tensor datatypes now correctly show `BIPOLAR` while the operator types have also heavily changed compared to the QONNX version. This is because in FINN, we use operators more suitable for FPGA implementations. `ConvertQONNXtoFINN` internally called many transformations which change the operators in such a manner and we can actually peek at the source code to see them using the `showSrc` function."
+    "Notice that the input and output tensor datatypes now correctly show `BIPOLAR` while the operator types have also heavily changed compared to the QONNX version. 
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "`ConvertQONNXtoFINN` internally called many transformations which change the operators in such a manner and we can actually peek at the source code to see them using the `showSrc` function." ] }, { @@ -215,7 +223,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "As we can see, `ConvertQONNXtoFINN` turned `Gemm` operation into `MatMuls` using the `GemmToMatMul()` transform and turned `Quant` nodes into `Thresholding` nodes using the `ConvertQuantActToMultiThreshold()` transform to name a few. However, these nodes do need further transformations before they can be turned into FPGA operators, which we handle in the next step." + "As we can see, `ConvertQONNXtoFINN` turned `Gemm` operation into `MatMuls` using the `GemmToMatMul()` transform and turned `Quant` nodes into `MultiThreshold` nodes using the `ConvertQuantActToMultiThreshold()` transform to name a few. However, these nodes do need further transformations before they can be turned into FPGA operators." ] }, { From 2d9deb58015dc890899835302ff38eda8f8653be Mon Sep 17 00:00:00 2001 From: lstasytis Date: Tue, 13 Aug 2024 12:35:16 +0100 Subject: [PATCH 11/51] [Deps] Update onnx commit hash --- fetch-repos.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fetch-repos.sh b/fetch-repos.sh index 2033973f2a..6ce9ad76d4 100755 --- a/fetch-repos.sh +++ b/fetch-repos.sh @@ -27,7 +27,7 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -QONNX_COMMIT="fd61cfeebbdaba351abf7e9d54cd785d7776fa4f" +QONNX_COMMIT="2281a777d84aa5cbd7469085c2e534fb4a03ccf9" FINN_EXP_COMMIT="0724be21111a21f0d81a072fccc1c446e053f851" BREVITAS_COMMIT="84f42259ec869eb151af4cb8a8b23ad925f493db" PYVERILATOR_COMMIT="ce0a08c20cb8c1d1e84181d6f392390f846adbd1" From f32cce868d373e97b80a071c0520cea6b58aa4aa Mon Sep 17 00:00:00 2001 From: lstasytis Date: Tue, 13 Aug 2024 13:55:28 +0100 Subject: [PATCH 12/51] Updated auto_pad_to_explicit_padding function path --- .../fpgadataflow/infer_pixel_padding_deconv.py | 5 ++--- .../test_fpgadataflow_convinputgenerator_rtl_dynamic.py | 6 +++--- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/src/finn/transformation/fpgadataflow/infer_pixel_padding_deconv.py b/src/finn/transformation/fpgadataflow/infer_pixel_padding_deconv.py index 8dbf7071fc..e1dcf1dde5 100644 --- a/src/finn/transformation/fpgadataflow/infer_pixel_padding_deconv.py +++ b/src/finn/transformation/fpgadataflow/infer_pixel_padding_deconv.py @@ -2,8 +2,7 @@ import warnings from onnx import TensorProto, helper from qonnx.transformation.base import Transformation -from qonnx.transformation.lower_convs_to_matmul import _auto_pad_to_explicit_padding -from qonnx.util.basic import get_by_name +from qonnx.util.basic import auto_pad_to_explicit_padding, get_by_name class InferPixelPaddingDeconv(Transformation): @@ -61,7 +60,7 @@ def apply(self, model): # use specified padding pad = get_by_name(n.attribute, "pads").ints else: - pad = _auto_pad_to_explicit_padding( + pad = auto_pad_to_explicit_padding( auto_pad, ifm_dim_h, ifm_dim_w, diff --git a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator_rtl_dynamic.py b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator_rtl_dynamic.py index 9c45b06f4a..02c86d9972 100644 --- a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator_rtl_dynamic.py +++ b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator_rtl_dynamic.py @@ 
-43,8 +43,8 @@ from qonnx.transformation.infer_shapes import InferShapes from qonnx.transformation.lower_convs_to_matmul import ( LowerConvsToMatMul, - _auto_pad_to_explicit_padding, ) +from qonnx.util.basic import auto_pad_to_explicit_padding from qonnx.util.basic import gen_finn_dt_tensor, get_by_name, qonnx_make_model import finn.core.onnx_exec as oxe @@ -69,11 +69,11 @@ def create_conv_model(idim_h, idim_w, ifm, k, stride, ofm, idt, wdt, pad_mode, d group = ifm if depthwise else 1 group_str = str(group) ishp = (1, ifm, idim_h, idim_w) - pad_0 = _auto_pad_to_explicit_padding(pad_mode, idim_h, idim_w, k, k, stride, stride, 2) + pad_0 = auto_pad_to_explicit_padding(pad_mode, idim_h, idim_w, k, k, stride, stride, 2) int_dim_h = compute_conv_output_dim(idim_h, k, stride, total_pad=pad_0[0] + pad_0[2]) int_dim_w = compute_conv_output_dim(idim_w, k, stride, total_pad=pad_0[1] + pad_0[3]) - pad_1 = _auto_pad_to_explicit_padding(pad_mode, int_dim_h, int_dim_w, k, k, stride, stride, 2) + pad_1 = auto_pad_to_explicit_padding(pad_mode, int_dim_h, int_dim_w, k, k, stride, stride, 2) odim_h = compute_conv_output_dim(int_dim_h, k, stride, total_pad=pad_1[0] + pad_1[2]) odim_w = compute_conv_output_dim(int_dim_w, k, stride, total_pad=pad_1[1] + pad_1[3]) oshp = (1, ifm, odim_h, odim_w) if depthwise else (1, ofm, odim_h, odim_w) From f71f4ba55b3fd6f0b39e9e5166b86a54d135e9ba Mon Sep 17 00:00:00 2001 From: lstasytis1 Date: Tue, 13 Aug 2024 13:24:35 +0000 Subject: [PATCH 13/51] linted the commit --- ...test_fpgadataflow_convinputgenerator_rtl_dynamic.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator_rtl_dynamic.py b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator_rtl_dynamic.py index 02c86d9972..26ce8f5f0e 100644 --- a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator_rtl_dynamic.py +++ b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator_rtl_dynamic.py @@ -41,11 +41,13 @@ from qonnx.transformation.general import GiveReadableTensorNames, GiveUniqueNodeNames from qonnx.transformation.infer_datatypes import InferDataTypes from qonnx.transformation.infer_shapes import InferShapes -from qonnx.transformation.lower_convs_to_matmul import ( - LowerConvsToMatMul, +from qonnx.transformation.lower_convs_to_matmul import LowerConvsToMatMul +from qonnx.util.basic import ( + auto_pad_to_explicit_padding, + gen_finn_dt_tensor, + get_by_name, + qonnx_make_model, ) -from qonnx.util.basic import auto_pad_to_explicit_padding -from qonnx.util.basic import gen_finn_dt_tensor, get_by_name, qonnx_make_model import finn.core.onnx_exec as oxe import finn.transformation.fpgadataflow.convert_to_hw_layers as to_hw From e22201f800a573b88d55f9b0024454a8e10fa0d4 Mon Sep 17 00:00:00 2001 From: auphelia Date: Fri, 16 Aug 2024 16:15:40 +0100 Subject: [PATCH 14/51] [HWop-MVAU] Ensure shape is compatible in execution function --- src/finn/custom_op/fpgadataflow/matrixvectoractivation.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py b/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py index 1c86ae7b7a..8f0a987bce 100644 --- a/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py +++ b/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py @@ -130,6 +130,8 @@ def get_nodeattr_types(self): def execute_node(self, context, graph): node = self.onnx_node in_act = context[node.input[0]] + # ensure that shape is compatible + in_act = 
in_act.reshape(self.get_normal_input_shape()) mvau_w_init = [x for x in graph.initializer if x.name == node.input[1]][0] mvau_w = np_helper.to_array(mvau_w_init) # Matrix multiplication From ec7be72196b5a4fa2e10cd2afcd83bada1977893 Mon Sep 17 00:00:00 2001 From: auphelia Date: Mon, 10 Jun 2024 16:19:31 +0100 Subject: [PATCH 15/51] [Transform] Skip broadcasting of thresholds in onnx conversion and extend conversion to hw layers --- .../fpgadataflow/convert_to_hw_layers.py | 74 +++++++++++++++++-- .../qonnx/qonnx_activation_handlers.py | 9 ++- 2 files changed, 71 insertions(+), 12 deletions(-) diff --git a/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py index e14181b140..ea5025a098 100644 --- a/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py +++ b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py @@ -529,6 +529,60 @@ def apply(self, model): graph = model.graph node_ind = 0 graph_modified = False + # check first if global input is split + successors = model.find_consumers(graph.input[0].name) + dt = model.get_tensor_datatype(graph.input[0].name) + if successors is not None and len(successors) >= 2 and dt.is_integer(): + output_tensor = graph.input[0].name + n_outputs = len(successors) + dt = model.get_tensor_datatype(output_tensor) + + # create clone tensors + out_shape = model.get_tensor_shape(output_tensor) + out_tensor_clones = [] + for i in range(n_outputs): + clone = helper.make_tensor_value_info( + model.make_new_valueinfo_name(), TensorProto.FLOAT, out_shape + ) + model.graph.value_info.append(clone) + out_tensor_clones += [clone.name] + + num_ch = int(out_shape[-1]) + vecs = out_shape[:-1] + + # create node with no parallelization first + pe = 1 + + dup_node = helper.make_node( + "DuplicateStreams", + [output_tensor], + out_tensor_clones, + domain="finn.custom_op.fpgadataflow", + backend="fpgadataflow", + NumChannels=num_ch, + PE=pe, + inputDataType=dt.name, + numInputVectors=vecs, + NumOutputStreams=n_outputs, + outFIFODepths=[2] * n_outputs, + name="DuplicateStreams_" + output_tensor, + ) + + graph.node.insert(0, dup_node) + + # connect successors to out tensor clone + clone_idx = 0 + for successor in successors: + for i, succ_input in enumerate(successor.input): + if succ_input == output_tensor: + successor.input[i] = out_tensor_clones[clone_idx] + clone_idx += 1 + # if one node has multiple connections to the same output + # find_direct_successors will return one node per input + # so break the inner loop will result in correct behaviour + break + graph_modified = True + for node in graph.node: node_ind += 1 successors = model.find_consumers(node.output[0]) @@ -1206,7 +1260,7 @@ def apply(self, model): graph_modified = False for node in graph.node: node_ind += 1 - if node.op_type == "Sub": + if node.op_type in ["Sub", "Add"]: in0 = node.input[0] in1 = node.input[1] result = node.output[0] @@ -1230,14 +1284,15 @@ def apply(self, model): if not (idt0.is_integer() and idt1.is_integer()): continue - eltwiseOp = "Sub" + eltwiseOp = node.op_type nodes_to_remove = [node] - # look for a downstream Abs node - res_consumer = model.find_consumer(result) - if (res_consumer is not None) and (res_consumer.op_type == "Abs"): - eltwiseOp = "AbsDiff" - result = res_consumer.output[0] - nodes_to_remove.append(res_consumer) + if node.op_type == "Sub": + # look for a downstream Abs node + res_consumer = model.find_consumer(result) + if (res_consumer is not None) and (res_consumer.op_type == "Abs"): + 
eltwiseOp = "AbsDiff" + result = res_consumer.output[0] + nodes_to_remove.append(res_consumer) # check layout and convert if necessary in0_layout = model.get_tensor_layout(in0) @@ -1438,6 +1493,9 @@ def apply(self, model): if n.op_type == "MatMul" and model.get_tensor_sparsity(n.input[1]) is None: mm_input = n.input[0] mm_weight = n.input[1] + # if mm_weight is not constant, skip node + if model.get_initializer(n.input[1]) is None: + continue mm_output = n.output[0] mm_in_shape = model.get_tensor_shape(mm_input) mm_out_shape = model.get_tensor_shape(mm_output) diff --git a/src/finn/transformation/qonnx/qonnx_activation_handlers.py b/src/finn/transformation/qonnx/qonnx_activation_handlers.py index 323e391df4..92a9731c2a 100644 --- a/src/finn/transformation/qonnx/qonnx_activation_handlers.py +++ b/src/finn/transformation/qonnx/qonnx_activation_handlers.py @@ -537,11 +537,12 @@ def _calculate_thresholds(self): for t in range(num_thresholds): thresholds[c][t] = min_threshold[c] + step[c] * t - # ToDo: The index 1 needs to be changed to -1 for the channels last format + # currently only per tensor or per channel quantization is supported num_output_channels = self._model.get_tensor_shape(self._q_node.output[0])[1] - final_shape = (num_output_channels, num_thresholds) - if thresholds.shape != final_shape: - thresholds = np.broadcast_to(thresholds, final_shape) + assert ( + thresholds.shape[0] == 1 or thresholds.shape[0] == num_output_channels + ), """Quant node cannot be converted to MultiThreshold because only + per tensor or per channel quantization supported.""" return thresholds From e1d1f63732c09863e753511dc7229396938528e0 Mon Sep 17 00:00:00 2001 From: auphelia Date: Thu, 22 Aug 2024 09:21:43 +0100 Subject: [PATCH 16/51] [Transform] Add comment to streamingeltwise conversion --- src/finn/transformation/fpgadataflow/convert_to_hw_layers.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py index ea5025a098..25a2032aeb 100644 --- a/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py +++ b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py @@ -1251,8 +1251,8 @@ def apply(self, model): class InferStreamingEltwise(Transformation): - """Convert eltwise Sub or Sub -> Abs to StreamingEltwise layer - with SubEltwise or AbsDiffEltwise op.""" + """Convert eltwise Add, Sub or Sub -> Abs to StreamingEltwise layer + with AddEltwise, SubEltwise or AbsDiffEltwise op.""" def apply(self, model): graph = model.graph From 188bf1715eb10e555da62f5d873408e43ea4dac8 Mon Sep 17 00:00:00 2001 From: auphelia Date: Thu, 20 Jun 2024 10:49:44 +0100 Subject: [PATCH 17/51] [Util] Add v80 to versal list --- src/finn/util/basic.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/finn/util/basic.py b/src/finn/util/basic.py index 91c191962f..0cb029a888 100644 --- a/src/finn/util/basic.py +++ b/src/finn/util/basic.py @@ -292,10 +292,10 @@ def memutil(req_mem_spec, primitive_spec): def is_versal(fpgapart): """Returns whether board is part of the Versal family""" - return ( - fpgapart[0:4] in ["xcvc", "xcve", "xcvp", "xcvm", "xqvc", "xqvm"] - or fpgapart[0:5] == "xqrvc" - ) + return fpgapart[0:4] in ["xcvc", "xcve", "xcvp", "xcvm", "xqvc", "xqvm"] or fpgapart[0:5] in [ + "xqrvc", + "xcv80", + ] def get_dsp_block(fpgapart): From 380d2ac00a91600fb39a06dcd27eaf50a9fd4a6f Mon Sep 17 00:00:00 2001 From: auphelia Date: Mon, 8 Jul 2024 11:26:49 
+0100 Subject: [PATCH 18/51] [RTL MVAU] Allow for 4bit compute with dsp48 for versal devices --- .../custom_op/fpgadataflow/rtl/matrixvectoractivation_rtl.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/finn/custom_op/fpgadataflow/rtl/matrixvectoractivation_rtl.py b/src/finn/custom_op/fpgadataflow/rtl/matrixvectoractivation_rtl.py index 3e81aa93e0..d9ab501117 100644 --- a/src/finn/custom_op/fpgadataflow/rtl/matrixvectoractivation_rtl.py +++ b/src/finn/custom_op/fpgadataflow/rtl/matrixvectoractivation_rtl.py @@ -208,7 +208,10 @@ def _resolve_impl_style(self, dsp_block): weight_width = self.get_input_datatype(1).bitwidth() if dsp_block == "DSP58": - return "mvu_vvu_8sx9_dsp58" + if act_width <= 4 and weight_width <= 4: + return "mvu_4sx4u_dsp48e2" + else: + return "mvu_vvu_8sx9_dsp58" else: if act_width <= 4 and weight_width <= 4: if dsp_block == "DSP48E1": From 91cec4ee2df3adfdc41790e456a163fbc4b16585 Mon Sep 17 00:00:00 2001 From: auphelia Date: Tue, 27 Aug 2024 11:10:37 +0100 Subject: [PATCH 19/51] [Deps] Update Brevitas commit and update unpacking of brevitas tensor in test --- fetch-repos.sh | 2 +- tests/brevitas/test_brevitas_debug.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/fetch-repos.sh b/fetch-repos.sh index 6ce9ad76d4..1d7d86b71b 100755 --- a/fetch-repos.sh +++ b/fetch-repos.sh @@ -29,7 +29,7 @@ QONNX_COMMIT="2281a777d84aa5cbd7469085c2e534fb4a03ccf9" FINN_EXP_COMMIT="0724be21111a21f0d81a072fccc1c446e053f851" -BREVITAS_COMMIT="84f42259ec869eb151af4cb8a8b23ad925f493db" +BREVITAS_COMMIT="89fca2f56b57650e77b8e400f9e579c065186ccd" PYVERILATOR_COMMIT="ce0a08c20cb8c1d1e84181d6f392390f846adbd1" CNPY_COMMIT="4e8810b1a8637695171ed346ce68f6984e585ef4" HLSLIB_COMMIT="16e5847a5e3ef76cffe84c8fad2f010d593457d3" diff --git a/tests/brevitas/test_brevitas_debug.py b/tests/brevitas/test_brevitas_debug.py index d6879a727b..3d059a6856 100644 --- a/tests/brevitas/test_brevitas_debug.py +++ b/tests/brevitas/test_brevitas_debug.py @@ -35,6 +35,7 @@ import os import torch from brevitas.export import export_qonnx +from brevitas.quant_tensor import _unpack_quant_tensor from pkgutil import get_data from qonnx.core.modelwrapper import ModelWrapper from qonnx.util.cleanup import cleanup as qonnx_cleanup @@ -90,7 +91,7 @@ def test_brevitas_debug(QONNX_FINN_conversion): else: assert len(names_common) == 8 for dbg_name in names_common: - tensor_pytorch = dbg_hook.values[dbg_name].value.detach().numpy() + tensor_pytorch = _unpack_quant_tensor(dbg_hook.values[dbg_name]).detach().numpy() tensor_finn = output_dict[dbg_name] assert np.isclose(tensor_finn, tensor_pytorch, atol=1e-5).all() os.remove(finn_onnx) From f6b1e2b6bb01428f96701c467ce76d774bda2c6c Mon Sep 17 00:00:00 2001 From: auphelia Date: Thu, 5 Sep 2024 10:49:10 +0100 Subject: [PATCH 20/51] [Deps] Update brevitas commit hash --- fetch-repos.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fetch-repos.sh b/fetch-repos.sh index 1d7d86b71b..a4fc124fa4 100755 --- a/fetch-repos.sh +++ b/fetch-repos.sh @@ -29,7 +29,7 @@ QONNX_COMMIT="2281a777d84aa5cbd7469085c2e534fb4a03ccf9" FINN_EXP_COMMIT="0724be21111a21f0d81a072fccc1c446e053f851" -BREVITAS_COMMIT="89fca2f56b57650e77b8e400f9e579c065186ccd" +BREVITAS_COMMIT="d4834bd2a0fad3c1fbc0ff7e1346d5dcb3797ea4" PYVERILATOR_COMMIT="ce0a08c20cb8c1d1e84181d6f392390f846adbd1" CNPY_COMMIT="4e8810b1a8637695171ed346ce68f6984e585ef4" HLSLIB_COMMIT="16e5847a5e3ef76cffe84c8fad2f010d593457d3" From 4a776ec99504bee33f20c5ec162b5828e222a14c Mon Sep 17 00:00:00 
2001 From: Remo Senekowitsch Date: Wed, 4 Sep 2024 10:45:29 +0200 Subject: [PATCH 21/51] Update links to finn-base repo Signed-off-by: Remo Senekowitsch --- docs/finn/faq.rst | 2 +- .../cybersecurity/3-build-accelerator-with-finn.ipynb | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/finn/faq.rst b/docs/finn/faq.rst index 70c2f24ed2..0d643feba3 100644 --- a/docs/finn/faq.rst +++ b/docs/finn/faq.rst @@ -81,7 +81,7 @@ Which data layout do FINN-generated accelerators use? Big-endian? Little-endian? If you need to do this manually, first examine how the `FINN PYNQ Python drivers `_ do this – notice how the input data is first reshaped to create the “folded input shape” that reflects the word size of the first layer based on how much it was parallelized, then data packing is applied to obtain a raw byte array (with some reversals going on) that can be - fed directly to the hardware. Another example of this is the `npy_to_rtlsim_input `_ function, which converts npy arrays to lists of Python arbitrary-precision integers that we feed into pyverilator for rtl simulation. + fed directly to the hardware. Another example of this is the `npy_to_rtlsim_input `_ function, which converts npy arrays to lists of Python arbitrary-precision integers that we feed into pyverilator for rtl simulation. Why does FIFO sizing take so long for my network? Is something wrong? The automatic FIFO sizing in FINN can take quite long. It unfortunately doesn’t really parallelize on multiple cores since diff --git a/notebooks/end2end_example/cybersecurity/3-build-accelerator-with-finn.ipynb b/notebooks/end2end_example/cybersecurity/3-build-accelerator-with-finn.ipynb index 73cd25cf20..28702d0286 100644 --- a/notebooks/end2end_example/cybersecurity/3-build-accelerator-with-finn.ipynb +++ b/notebooks/end2end_example/cybersecurity/3-build-accelerator-with-finn.ipynb @@ -78,7 +78,7 @@ "### Configuring the Board and FPGA Part \n", "\n", "* `fpga_part`: Xilinx FPGA part to be used for synthesis, can be left unspecified to be inferred from `board` below, or specified explicitly for e.g. out-of-context synthesis.\n", - "* `board`: target Xilinx Zynq or Alveo board for generating accelerators integrated into a shell. See the `pynq_part_map` and `alveo_part_map` dicts in [this file](https://github.com/Xilinx/finn-base/blob/dev/src/finn/util/basic.py#L41) for a list of possible boards.\n", + "* `board`: target Xilinx Zynq or Alveo board for generating accelerators integrated into a shell. 
See the `pynq_part_map` and `alveo_part_map` dicts in [this file](https://github.com/Xilinx/finn/blob/dev/src/finn/util/basic.py#L39) for a list of possible boards.\n", "* `shell_flow_type`: the target [shell flow type](https://finn-dev.readthedocs.io/en/latest/source_code/finn.builder.html#finn.builder.build_dataflow_config.ShellFlowType), only needed for generating full bitfiles where the FINN design is integrated into a shell (so only needed if `BITFILE` is selected) \n", "\n", "### Configuring the Performance \n", From ec5613c68f209202cf7fefb21d383b0072a2441f Mon Sep 17 00:00:00 2001 From: auphelia Date: Thu, 19 Sep 2024 10:15:08 +0100 Subject: [PATCH 22/51] [InsertFIFO] Preserve onnx tensor dtype when inserting FIFOs --- src/finn/transformation/fpgadataflow/insert_fifo.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/src/finn/transformation/fpgadataflow/insert_fifo.py b/src/finn/transformation/fpgadataflow/insert_fifo.py index 9df193efcf..21fb843052 100644 --- a/src/finn/transformation/fpgadataflow/insert_fifo.py +++ b/src/finn/transformation/fpgadataflow/insert_fifo.py @@ -29,7 +29,6 @@ import numpy as np import warnings -from onnx import TensorProto from onnx import helper as oh from qonnx.custom_op.registry import getCustomOp from qonnx.transformation.base import Transformation @@ -114,6 +113,8 @@ def apply(self, model): # determine fifo node attributes fld_shape = n0.get_folded_output_shape() dtype = n0.get_output_datatype() + n0_otensor = model.get_tensor_valueinfo(output_name) + n0_tensor_dtype = n0_otensor.type.tensor_type.elem_type # check if folded_shape of output of first node and # input of the second node is equal @@ -145,7 +146,7 @@ def apply(self, model): # or unless create_shallow_fifos is specified fifo_output_tensor = oh.make_tensor_value_info( model.make_new_valueinfo_name(), - TensorProto.FLOAT, + n0_tensor_dtype, n0.get_normal_output_shape(), ) graph.value_info.append(fifo_output_tensor) @@ -196,13 +197,15 @@ def apply(self, model): fld_shape = n0.get_folded_input_shape(inp_ind) n_shape = n0.get_normal_input_shape(inp_ind) dtype = n0.get_input_datatype(inp_ind) + n0_itensor = model.get_tensor_valueinfo(graph_in_name) + n0_tensor_dtype = n0_itensor.type.tensor_type.elem_type fifo_depth = n0.get_nodeattr("inFIFODepths")[inp_ind] if fifo_depth > 2 or self.create_shallow_fifos: # create fifo node fifo_output_tensor = oh.make_tensor_value_info( model.make_new_valueinfo_name(), - TensorProto.FLOAT, + n0_tensor_dtype, n0.get_normal_input_shape(inp_ind), ) graph.value_info.append(fifo_output_tensor) @@ -256,13 +259,15 @@ def apply(self, model): fld_shape = n0.get_folded_output_shape(out_ind) n_shape = n0.get_normal_output_shape(out_ind) dtype = n0.get_output_datatype(out_ind) + n0_otensor = model.get_tensor_valueinfo(graph_out_name) + n0_tensor_dtype = n0_otensor.type.tensor_type.elem_type fifo_depth = n0.get_nodeattr("outFIFODepths")[out_ind] if fifo_depth > 2 or self.create_shallow_fifos: # create fifo node fifo_input_tensor = oh.make_tensor_value_info( model.make_new_valueinfo_name(), - TensorProto.FLOAT, + n0_tensor_dtype, n0.get_normal_output_shape(), ) graph.value_info.append(fifo_input_tensor) From fb600553d5618d36be334f7dd6c99dea789b0c83 Mon Sep 17 00:00:00 2001 From: auphelia Date: Thu, 19 Sep 2024 10:58:43 +0100 Subject: [PATCH 23/51] [InsertDWC] Preserve onnx tensor dtype when inserting DWCs --- src/finn/transformation/fpgadataflow/insert_dwc.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git 
a/src/finn/transformation/fpgadataflow/insert_dwc.py b/src/finn/transformation/fpgadataflow/insert_dwc.py index 33cc3e86d3..b56c8b74ea 100644 --- a/src/finn/transformation/fpgadataflow/insert_dwc.py +++ b/src/finn/transformation/fpgadataflow/insert_dwc.py @@ -26,7 +26,6 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -from onnx import TensorProto from onnx import helper as oh from qonnx.custom_op.registry import getCustomOp from qonnx.transformation.base import Transformation @@ -110,12 +109,15 @@ def apply(self, model): # determine shape for dwc dwc_shape = n0.get_normal_output_shape() - # determine dtype for dwc + # determine FINN dtype for dwc dtype = n0.get_output_datatype() + # determine onnx tensor dtype for dwc + n0_otensor = model.get_tensor_valueinfo(output_name) + n0_tensor_dtype = n0_otensor.type.tensor_type.elem_type dwc_output_tensor = oh.make_tensor_value_info( model.make_new_valueinfo_name(), - TensorProto.FLOAT, + n0_tensor_dtype, dwc_shape, ) graph.value_info.append(dwc_output_tensor) From 03830929697464666b58be717ece8328bc6c6965 Mon Sep 17 00:00:00 2001 From: Michal Danilowicz Date: Mon, 16 Sep 2024 13:28:15 +0000 Subject: [PATCH 24/51] [Fix] InferDuplicateStreamsLayer now properly handles forks of multiple-output nodes --- .../fpgadataflow/convert_to_hw_layers.py | 96 +++++++++---------- 1 file changed, 48 insertions(+), 48 deletions(-) diff --git a/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py index 25a2032aeb..b02bc89db8 100644 --- a/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py +++ b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py @@ -585,63 +585,63 @@ def apply(self, model): for node in graph.node: node_ind += 1 - successors = model.find_consumers(node.output[0]) - if successors is not None and len(successors) >= 2: - output_tensor = node.output[0] - n_outputs = len(successors) + for output_tensor in node.output: + successors = model.find_consumers(output_tensor) + if successors is not None and len(successors) >= 2: + n_outputs = len(successors) - dt = model.get_tensor_datatype(output_tensor) + dt = model.get_tensor_datatype(output_tensor) - # skip conversion for layers with float input - if not dt.is_integer(): - continue + # skip conversion for layers with float input + if not dt.is_integer(): + continue - # create clone tensors - out_shape = model.get_tensor_shape(output_tensor) - out_tensor_clones = [] - for i in range(n_outputs): - clone = helper.make_tensor_value_info( - model.make_new_valueinfo_name(), TensorProto.FLOAT, out_shape - ) - model.graph.value_info.append(clone) - out_tensor_clones += [clone.name] + # create clone tensors + out_shape = model.get_tensor_shape(output_tensor) + out_tensor_clones = [] + for i in range(n_outputs): + clone = helper.make_tensor_value_info( + model.make_new_valueinfo_name(), TensorProto.FLOAT, out_shape + ) + model.graph.value_info.append(clone) + out_tensor_clones += [clone.name] - num_ch = int(out_shape[-1]) - vecs = out_shape[:-1] + num_ch = int(out_shape[-1]) + vecs = out_shape[:-1] - # create node with no parallelization first - pe = 1 + # create node with no parallelization first + pe = 1 - dup_node = helper.make_node( - "DuplicateStreams", - [output_tensor], - out_tensor_clones, - domain="finn.custom_op.fpgadataflow", - backend="fpgadataflow", - NumChannels=num_ch, - PE=pe, - inputDataType=dt.name, - 
numInputVectors=vecs, - NumOutputStreams=n_outputs, - outFIFODepths=[2] * n_outputs, - name="DuplicateStreams_" + node.name, - ) + dup_node = helper.make_node( + "DuplicateStreams", + [output_tensor], + out_tensor_clones, + domain="finn.custom_op.fpgadataflow", + backend="fpgadataflow", + NumChannels=num_ch, + PE=pe, + inputDataType=dt.name, + numInputVectors=vecs, + NumOutputStreams=n_outputs, + outFIFODepths=[2] * n_outputs, + name="DuplicateStreams_" + node.name, + ) - graph.node.insert(node_ind, dup_node) + graph.node.insert(node_ind, dup_node) - # connect successors to out tensor clone - clone_idx = 0 - for successor in successors: - for i, succ_input in enumerate(successor.input): - if succ_input == output_tensor: - successor.input[i] = out_tensor_clones[clone_idx] - clone_idx += 1 - # if one node has multiple connections to the same output - # find_direct_successors will return one node per input - # so break the inner loop will result in correct behaviour - break + # connect successors to out tensor clone + clone_idx = 0 + for successor in successors: + for i, succ_input in enumerate(successor.input): + if succ_input == output_tensor: + successor.input[i] = out_tensor_clones[clone_idx] + clone_idx += 1 + # if one node has multiple connections to the same output + # find_direct_successors will return one node per input + # so break the inner loop will result in correct behaviour + break - graph_modified = True + graph_modified = True if graph_modified: model = model.transform(SortGraph()) From d13aa7e7debb21bd1d75b6dbb6eddc959b4ae8c8 Mon Sep 17 00:00:00 2001 From: Michal Danilowicz Date: Mon, 16 Sep 2024 13:48:43 +0000 Subject: [PATCH 25/51] [Fix] MoveScalarLinearPastInvariants, MakeMaxPoolNHWC, MakeScaleResizeNHWC transformations are checking whether the node to be moved is a fork node, in which case the MoveOpPastFork is called. MoveOpPastFork uses deepcopies of the original node. 
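The switch to deepcopies matters because a node rebuilt via `helper.make_node` silently loses any attributes that are not explicitly re-specified (the `perm` of a Transpose, for example). A rough standalone illustration of the difference (made-up tensor names, not part of the diff below):

    from copy import deepcopy
    from onnx import helper

    orig = helper.make_node("Transpose", ["inp"], ["outp"], perm=[0, 3, 1, 2])

    # rebuilding drops the perm attribute unless it is passed in again
    rebuilt = helper.make_node(orig.op_type, ["inp"], ["outp_branch0"])
    # a deepcopy keeps all attributes; only the output needs rewiring
    cloned = deepcopy(orig)
    cloned.output[:] = ["outp_branch0"]

    print(len(rebuilt.attribute))  # 0 -> perm is lost
    print(len(cloned.attribute))   # 1 -> perm is preserved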
--- src/finn/transformation/streamline/reorder.py | 30 ++++++++++++++----- 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/src/finn/transformation/streamline/reorder.py b/src/finn/transformation/streamline/reorder.py index 8ac2d7dad6..9a7e9d0723 100644 --- a/src/finn/transformation/streamline/reorder.py +++ b/src/finn/transformation/streamline/reorder.py @@ -29,6 +29,7 @@ import numpy as np import qonnx.core.data_layout as DataLayout import warnings +from copy import deepcopy from onnx import TensorProto from onnx import helper as oh from qonnx.core.datatype import DataType @@ -641,6 +642,10 @@ def apply(self, model): # if initializer is not scalar, skip if np.prod(init0.shape) != 1: continue + if model.is_fork_node(prod0): + model = model.transform(MoveOpPastFork(prod0.op_type)) + # topology modified, "ask" ModelWrapper to apply this transform again + return (model, True) # Flatten input if required if len(init0.shape) > 0: init0 = init0.flatten()[0] @@ -713,6 +718,12 @@ def apply(self, model): elif producer is not None and producer.op_type == "Transpose": perms = list(get_by_name(producer.attribute, "perm").ints) if perms == [0, 3, 1, 2]: + # check if the producer is a fork node + # (need to move it past the fork before this transform) + if model.is_fork_node(producer): + model = model.transform(MoveTransposePastFork()) + # topology modified, "ask" ModelWrapper to apply this transform again + return (model, True) ceil_mode = get_by_name(n.attribute, "ceil_mode") if ceil_mode is not None: ceil_mode = ceil_mode.i @@ -764,6 +775,12 @@ def apply(self, model): if producer is not None and producer.op_type == "Transpose": perms = list(get_by_name(producer.attribute, "perm").ints) if perms == [0, 3, 1, 2]: + # check if the producer is a fork node + # (need to move it past the fork before this transform) + if model.is_fork_node(producer): + model = model.transform(MoveTransposePastFork()) + # topology modified, "ask" ModelWrapper to apply this transform again + return (model, True) old_value = model.get_initializer(n.input[scales_ind]) new_value = np.array( [old_value[idx] for idx in (0, 2, 3, 1)], @@ -813,10 +830,9 @@ class MoveOpPastFork(Transformation): can be merged with nodes in the branches """ - def __init__(self, op_name_list, get_attrs_fxn=lambda x: {}): + def __init__(self, op_name_list): super().__init__() self.ops_to_move = op_name_list - self.get_attrs_fxn = get_attrs_fxn def apply(self, model): graph = model.graph @@ -859,11 +875,9 @@ def apply(self, model): new_param_name = model.make_new_valueinfo_name() new_inp_list = [n.input[0], new_param_name] model.set_initializer(new_param_name, op_init_param) - attrs = self.get_attrs_fxn(n) - # TODO use copy of original node instead to get attrs? 
- new_node = oh.make_node( - n.op_type, new_inp_list, [new_output_tensor_name], **attrs - ) + new_node = deepcopy(n) + new_node.input[:] = new_inp_list + new_node.output[:] = [new_output_tensor_name] graph.node.insert(node_ind, new_node) node_ind += 1 @@ -901,7 +915,7 @@ def __init__(self): class MoveTransposePastFork(MoveOpPastFork): def __init__(self): - super().__init__(["Transpose"], lambda x: {"perm": get_by_name(x.attribute, "perm").ints}) + super().__init__(["Transpose"]) class MoveMaxPoolPastMultiThreshold(Transformation): From 6223abe86c7d9aee43788825f3c19545dab0ea54 Mon Sep 17 00:00:00 2001 From: Michal Danilowicz Date: Mon, 16 Sep 2024 13:59:14 +0000 Subject: [PATCH 26/51] [Fix] InsertFIFO transform is fixed for the case of the last node in the graph being a fork node --- src/finn/transformation/fpgadataflow/insert_fifo.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/finn/transformation/fpgadataflow/insert_fifo.py b/src/finn/transformation/fpgadataflow/insert_fifo.py index 21fb843052..9ed0f51cd4 100644 --- a/src/finn/transformation/fpgadataflow/insert_fifo.py +++ b/src/finn/transformation/fpgadataflow/insert_fifo.py @@ -268,7 +268,7 @@ def apply(self, model): fifo_input_tensor = oh.make_tensor_value_info( model.make_new_valueinfo_name(), n0_tensor_dtype, - n0.get_normal_output_shape(), + n0.get_normal_output_shape(out_ind), ) graph.value_info.append(fifo_input_tensor) model.set_tensor_datatype(fifo_input_tensor.name, dtype) @@ -294,7 +294,7 @@ def apply(self, model): graph.node.append(fifo_node) # set fifo output tensor as new input tensor of second node - final_node.output[0] = fifo_input_tensor.name + final_node.output[out_ind] = fifo_input_tensor.name else: warnings.warn( """Output FIFO for %s has depth %d and won't From 11d8234fdcfb03c00a700dd3ba82cb88d6da66e9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20B=2E=20Preu=C3=9Fer?= Date: Wed, 25 Sep 2024 13:27:04 +0100 Subject: [PATCH 27/51] Harden lane width computations against 32-bit numeric overflow. --- finn-rtllib/mvu/mvu_8sx8u_dsp48.sv | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv index 107a00918e..dabb36647e 100644 --- a/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv +++ b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv @@ -72,6 +72,10 @@ module mvu_8sx8u_dsp48 #( return res; endfunction : init_leave_loads + function int unsigned sum_width(input int unsigned n, input int unsigned w); + return w <= 16? $clog2(1 + n*(2**w - 1)) : w + $clog2(n); + endfunction : sum_width + // Pipeline for last indicator flag logic [1:5] L = '0; always_ff @(posedge clk) begin @@ -445,7 +449,7 @@ module mvu_8sx8u_dsp48 #( // Stage #4: Cross-SIMD Reduction // Count leaves reachable from each node - localparam leave_load_t LEAVE_LOAD = SIMD > 1 ? init_leave_loads() : '{ default: 0}; // SIMD=1 requires no adder tree, so zero-ing out, otherwise init_leave_loads ends up in infinite loop + localparam leave_load_t LEAVE_LOAD = SIMD > 1 ? init_leave_loads() : '{ default: 0 }; // SIMD=1 requires no adder tree, so zero-ing out, otherwise init_leave_loads ends up in infinite loop // Range of Cross-lane Contribution Tracked in Hi4 /* @@ -462,7 +466,7 @@ module mvu_8sx8u_dsp48 #( * signed value is determined by its lower bound to be at least: * 1 + $clog2(2^(w-1)+SIMD) */ - localparam int unsigned HI_WIDTH = 1 + $clog2(2**(ACCU_WIDTH-D[1]-1)+SIMD); + localparam int unsigned HI_WIDTH = 1 + ($clog2(SIMD) < ACCU_WIDTH-D[1]? 
ACCU_WIDTH-D[1] : $clog2(2**(ACCU_WIDTH-D[1]-1)+SIMD)); uwire signed [ACCU_WIDTH -1:0] up4; uwire signed [HI_WIDTH -1:0] hi4; @@ -504,12 +508,12 @@ module mvu_8sx8u_dsp48 #( // Conclusive low part accumulation if(i >= PE_REM) begin : blkLo // Adder Tree across all SIMD low contributions (all unsigned arithmetic) - localparam int unsigned ROOT_WIDTH = $clog2(1 + SIMD*(2**LO_WIDTH-1)); + localparam int unsigned ROOT_WIDTH = sum_width(SIMD, LO_WIDTH); uwire [2*SIMD-2:0][ROOT_WIDTH-1:0] tree; for(genvar s = 0; s < SIMD; s++) assign tree[SIMD-1+s] = p3[s][D[i]+:LO_WIDTH]; for(genvar n = 0; n < SIMD-1; n++) begin // Sum truncated to actual maximum bit width at this node - localparam int unsigned NODE_WIDTH = $clog2(1 + LEAVE_LOAD[n]*(2**LO_WIDTH-1)); + localparam int unsigned NODE_WIDTH = sum_width(LEAVE_LOAD[n], LO_WIDTH); uwire [NODE_WIDTH-1:0] s = tree[2*n+1] + tree[2*n+2]; assign tree[n] = s; end From 945a4a4c7e341b3d5acaa929e51672babe70bc36 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20B=2E=20Preu=C3=9Fer?= Date: Wed, 25 Sep 2024 13:33:04 +0100 Subject: [PATCH 28/51] Adding testbench having two accumulator sized run against one another. --- finn-rtllib/mvu/tb/mvu_accu_tb.dat | 192 +++++++++++++++++++++++++++++ finn-rtllib/mvu/tb/mvu_accu_tb.sv | 162 ++++++++++++++++++++++++ 2 files changed, 354 insertions(+) create mode 100644 finn-rtllib/mvu/tb/mvu_accu_tb.dat create mode 100644 finn-rtllib/mvu/tb/mvu_accu_tb.sv diff --git a/finn-rtllib/mvu/tb/mvu_accu_tb.dat b/finn-rtllib/mvu/tb/mvu_accu_tb.dat new file mode 100644 index 0000000000..7e102ab6ab --- /dev/null +++ b/finn-rtllib/mvu/tb/mvu_accu_tb.dat @@ -0,0 +1,192 @@ +9 +4 +d +9 +2 +a +d +7 +9 +7 +b +4 +4 +7 +0 +0 +c +9 +9 +1 +9 +0 +a +0 +5 +5 +7 +7 +2 +6 +7 +9 +0 +0 +9 +7 +7 +c +7 +9 +7 +1 +2 +0 +f +7 +1 +7 +f +7 +1 +7 +1 +6 +6 +9 +e +f +e +a +6 +1 +7 +9 +d +a +7 +7 +f +4 +7 +f +9 +f +9 +1 +9 +f +7 +3 +4 +1 +1 +0 +d +c +d +b +9 +9 +f +7 +0 +5 +e +6 +7 +e +7 +1 +7 +0 +e +3 +c +4 +9 +7 +9 +9 +d +e +c +1 +f +7 +0 +7 +1 +7 +d +0 +7 +e +a +1 +9 +4 +b +7 +9 +0 +a +e +6 +7 +2 +9 +0 +9 +0 +9 +1 +9 +0 +0 +7 +2 +7 +1 +5 +9 +1 +9 +6 +7 +c +1 +9 +d +9 +f +c +9 +9 +9 +b +b +9 +f +9 +5 +1 +3 +0 +9 +0 +9 +2 +a +9 +0 +f +0 +7 +0 +a +7 +3 +e +5 +7 diff --git a/finn-rtllib/mvu/tb/mvu_accu_tb.sv b/finn-rtllib/mvu/tb/mvu_accu_tb.sv new file mode 100644 index 0000000000..ceeb31194c --- /dev/null +++ b/finn-rtllib/mvu/tb/mvu_accu_tb.sv @@ -0,0 +1,162 @@ +/****************************************************************************** + * Copyright (C) 2024, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * @brief Testbench for MVU core compute kernel. + *****************************************************************************/ + +module mvu_accu_tb; + + localparam IS_MVU = 1; + localparam COMPUTE_CORE = "mvu_8sx8u_dsp48"; + localparam PUMPED_COMPUTE = 0; + localparam MW = 6; + localparam MH = 32; + localparam PE = 1; + localparam SIMD = 1; + localparam ACTIVATION_WIDTH = 8; + localparam WEIGHT_WIDTH = 4; + localparam NARROW_WEIGHTS = 1; + localparam SIGNED_ACTIVATIONS = 1; + localparam SEGMENTLEN = 1; + localparam FORCE_BEHAVIORAL = 0; + + // Safely deducible parameters + localparam WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8; + localparam INPUT_STREAM_WIDTH_BA = ((IS_MVU == 1 ? 1 : PE) * SIMD * ACTIVATION_WIDTH + 7) / 8 * 8; + + // Global Control + logic clk = 0; + always #5ns clk = !clk; + logic rst = 1; + initial begin + repeat(16) @(posedge clk); + rst <= 0; + end + + logic [WEIGHT_WIDTH-1:0] WeightMem[MH*MW]; + initial $readmemh("mvu_accu_tb.dat", WeightMem); + + // Shared Input Feed + logic [INPUT_STREAM_WIDTH_BA-1:0] in_TDATA; + logic in_TVALID[2]; + uwire in_TREADY[2]; + initial begin + in_TDATA = 'x; + in_TVALID = '{ default: 0 }; + @(posedge clk iff !rst); + + repeat(2161*MW) begin + automatic logic [ACTIVATION_WIDTH-1:0] a = $urandom(); + in_TDATA <= a; + in_TVALID <= '{ default: 1 }; + fork + begin + @(posedge clk iff in_TREADY[0]); + in_TVALID[0] <= 0; + end + begin + @(posedge clk iff in_TREADY[1]); + in_TVALID[1] <= 0; + end + join + end + + repeat(MH*MW) @(posedge clk); + $display("Test completed."); + $finish; + end + + // DUTs + localparam int unsigned ACCU_WIDTHS[2] = '{ 16, 32 }; + int OutQ[2][$]; + for(genvar i = 0; i < $size(ACCU_WIDTHS); i++) begin : genDUTs + localparam int unsigned ACCU_WIDTH = ACCU_WIDTHS[i]; + localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8; + + // Private Weight Feed + logic [WEIGHT_STREAM_WIDTH_BA-1:0] weights_TDATA; + logic weights_TVALID; + uwire weights_TREADY; + initial begin + weights_TDATA = 'x; + weights_TVALID = 0; + @(posedge clk iff !rst); + + weights_TVALID <= 1; + forever begin + for(int unsigned i = 0; i < MH*MW; i++) begin + weights_TDATA <= WeightMem[i]; + @(posedge clk iff weights_TREADY); + end + end + end + + // Private Output Capture into Queue + uwire signed [OUTPUT_STREAM_WIDTH_BA-1:0] out_TDATA; + uwire out_TVALID; + uwire out_TREADY = !rst; + always_ff @(posedge clk iff !rst) begin + if(out_TVALID) OutQ[i].push_back(out_TDATA); + end + + // Actual DUT Instance + mvu_vvu_axi #( + .IS_MVU(IS_MVU), .COMPUTE_CORE(COMPUTE_CORE), .PUMPED_COMPUTE(PUMPED_COMPUTE), .MW(MW), .MH(MH), .PE(PE), .SIMD(SIMD), + .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH), 
.ACCU_WIDTH(ACCU_WIDTH), .NARROW_WEIGHTS(NARROW_WEIGHTS), + .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .SEGMENTLEN(SEGMENTLEN), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL) + ) dut ( + .ap_clk(clk), + .ap_clk2x(1'b0), + .ap_rst_n(!rst), + .s_axis_weights_tdata(weights_TDATA), + .s_axis_weights_tvalid(weights_TVALID), + .s_axis_weights_tready(weights_TREADY), + .s_axis_input_tdata(in_TDATA), + .s_axis_input_tvalid(in_TVALID[i]), + .s_axis_input_tready(in_TREADY[i]), + .m_axis_output_tdata(out_TDATA), + .m_axis_output_tvalid(out_TVALID), + .m_axis_output_tready(out_TREADY) + ); + end : genDUTs + + // Output Equivalence Checker + always_ff @(posedge clk) begin + if(OutQ[0].size && OutQ[1].size) begin + automatic int unsigned y0 = OutQ[0].pop_front(); + automatic int unsigned y1 = OutQ[1].pop_front(); + assert(y0 == y1) else begin + $error("Output Mismatch: %0d vs. %0d", y0, y1); + $stop; + end + end + end + +endmodule : mvu_accu_tb From 00c3a83aae2a28d75abc097d2655633fc7d55c0d Mon Sep 17 00:00:00 2001 From: auphelia Date: Thu, 26 Sep 2024 11:44:18 +0100 Subject: [PATCH 29/51] [RoundThresh] Clean-up transformation and test files --- .../streamline/round_thresholds.py | 43 +-- .../streamline/test_round_thresholds.py | 263 +++++++++--------- 2 files changed, 132 insertions(+), 174 deletions(-) diff --git a/src/finn/transformation/streamline/round_thresholds.py b/src/finn/transformation/streamline/round_thresholds.py index 2666242730..ab986e7826 100644 --- a/src/finn/transformation/streamline/round_thresholds.py +++ b/src/finn/transformation/streamline/round_thresholds.py @@ -1,4 +1,5 @@ -# Copyright (c) 2020, Xilinx +# Copyright (c) 2020-2022, Xilinx +# Copyright (C) 2022-2024, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -26,22 +27,12 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -# Need numpy for modifying the onnx graph tensors, which are numpy style arrays import numpy as np - -# QONNX wrapper of ONNX model graphs from qonnx.core.modelwrapper import ModelWrapper - -# QONNX graph transformation base class from qonnx.transformation.base import Transformation - -# Transformation running qonnx datatype inference from qonnx.transformation.infer_datatypes import InferDataTypes -# Rounds and clips thresholds to integer values if the node inputs are integer, -# respecting range, representability and data type (promotion) of the container -# data type class RoundAndClipThresholds(Transformation): """For MultiThreshold nodes operating on integer inputs, round up thresholds values to the nearest integer. Additionally, if the input @@ -50,29 +41,19 @@ class RoundAndClipThresholds(Transformation): annotation). 
Runs InferDataTypes() afterward to propagate any changes to the quantization data types.""" - # Applies the transform to a whole model graph def apply(self, model: ModelWrapper): # noqa - # Get the model graph out of the model wrapper object graph = model.graph - # Keep track of whether the graph has been modified graph_modified = False - # Iterate all nodes in the graph keeping track of the index for index, node in enumerate(graph.node): - # Applies to initializer tensors of MultiThreshold operations - if node.op_type == "MultiThreshold": - # Try to get the thresholds initializer tensor + op_type = node.op_type + if op_type == "MultiThreshold": thresholds = model.get_initializer(node.input[1]) - # There might be no constant thresholds stored as initializer - # tensor inside the model if thresholds is None: - # Nothing we can do, skip to the next node continue - # Get the data type of the inputs to this operation dtype = model.get_tensor_datatype(node.input[0]) # This transformation only applies to thresholding operations # operating on integer inputs if not dtype.is_integer(): - # Nothing we can do, skip to the next node continue # Round thresholds up to nearest integer and clip thresholds # outside the input range @@ -80,24 +61,14 @@ def apply(self, model: ModelWrapper): # noqa # introduce extra inaccuracies due to large integers not being # exactly representable in floating-point representation. # See for example: np.ceil(np.float32(16777217)) == 16777216 - # fmt: off - new_thresholds = np.clip( - np.ceil(thresholds), dtype.min(), dtype.max() - ) - # fmt: on + new_thresholds = np.clip(np.ceil(thresholds), dtype.min(), dtype.max()) # Convert back to the preferred float32 container type - # Note: np.clip might have promoted the thresholds to float64 - # TODO: Maybe consider an int64 container type for thresholds - # rounded to integer? Need to check all other transformations - # and code generation through the whole FINN and QONNX stack - # first, as these probably assume a float32 container type. new_thresholds = new_thresholds.astype(np.float32) # Insert the rounded and clipped thresholds back into the model model.set_initializer(node.input[1], new_thresholds) # The rounded and clipped thresholds now fit into the input data # type model.set_tensor_datatype(node.input[1], dtype) - # Test whether the new thresholds actually differ from the old # ones if np.any(new_thresholds != thresholds): # Track the graph has been modified to inform the transform @@ -107,9 +78,5 @@ def apply(self, model: ModelWrapper): # noqa # Immediately exit here to propagate the data type changes # before considering the next node break - # Some data types might have changed, do one pass of data type inference - # to propagate these changes through the graph model = model.transform(InferDataTypes()) - # Return the transformed model and indicate whether the graph actually - # has been transformed to exhaustively apply this transformation again. return model, graph_modified diff --git a/tests/transformation/streamline/test_round_thresholds.py b/tests/transformation/streamline/test_round_thresholds.py index 63375598a0..7e2d39176e 100644 --- a/tests/transformation/streamline/test_round_thresholds.py +++ b/tests/transformation/streamline/test_round_thresholds.py @@ -1,4 +1,5 @@ -# Copyright (c) 2020, Xilinx +# Copyright (c) 2020-2022, Xilinx, Inc. +# Copyright (C) 2022-2024, Advanced Micro Devices, Inc. # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without @@ -26,32 +27,15 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -# fmt: off -# Disable formatter. This is deliberately formatted to stay within 80 characters -# per line. Black, however, formats some lines going beyond this. - -# Testing framework import pytest -# Use numpy for python execution / computing the ground truth expected values import numpy as np - -# Utility types and function for creating onnx nodes and graphs from onnx import TensorProto, helper - -# QONNX data types like INT25 from qonnx.core.datatype import DataType - -# QONNX wrapper of ONNX model graphs from qonnx.core.modelwrapper import ModelWrapper - -# Generate random tensors of QONNX/FINN data types for testing from qonnx.util.basic import gen_finn_dt_tensor -# Execution of onnx graphs within FINN import finn.core.onnx_exec as oxe - -# The transformation to be tested from finn.transformation.streamline import RoundAndClipThresholds @@ -59,173 +43,186 @@ # data type combinations with purely integer inputs. Without proper rounding, # this tests only the clipping, range and type-casting behavior of the # transformation. -@pytest.mark.parametrize("i_dtype", [ - # Explanation for selecting these test configurations: - # 1. Below 24-bit thresholds we will not observe any interesting rounding - # behavior, as all integers < 2^24 can be exactly represented in 32-bit - # floating-point. Thus, we test thresholds at 25-bit signed integers and - # generate test inputs slightly above and below this. - # 2. We want to test out-of-range clipping of thresholds, in particular - # clipping of the negative portion of signed thresholds. Thus, we only - # generate signed thresholds, but test with signed and unsigned - # inputs of smaller, larger and equal range. - # 3. Testing proper floating-point thresholds requires a separate test-case - "INT23", "UINT23", "INT24", "UINT24", "INT25", "UINT25", "INT26", "UINT26" -]) -@pytest.mark.parametrize("o_dtype", [ - # Explanation for selecting these test configurations: - # 1. Outputs of MultiThreshold are typically much smaller bit-width than the - # inputs and thresholds. - # 2. However, with randomly samples thresholds from a rather large range due - # to the selected input bit-widths (see above), we risk not adequately - # covering the input range if we sample too few thresholds. The number of - # thresholds sampled depends on the bit-width of the output, thus we use - # rather high bit-width for testing. - # 3. For a "real" model, the quantization procedure *should* take care of - # adequately covering the true input range. - "INT8", "UINT8" -]) -@pytest.mark.parametrize("n_elems", [ - # Explanation for selecting these test configurations: - # 1. Small edge cases and quickly running through tests: 1, 2, 3, 4 - # 2. Large test case 256, hopefully amplifying any rarely occurring errors - 1, 2, 3, 4, 256 -]) +@pytest.mark.parametrize( + "i_dtype", + [ + # Explanation for selecting these test configurations: + # 1. Below 24-bit thresholds we will not observe any interesting rounding + # behavior, as all integers < 2^24 can be exactly represented in 32-bit + # floating-point. Thus, we test thresholds at 25-bit signed integers and + # generate test inputs slightly above and below this. + # 2. We want to test out-of-range clipping of thresholds, in particular + # clipping of the negative portion of signed thresholds. 
Thus, we only
+        #    generate signed thresholds, but test with signed and unsigned
+        #    inputs of smaller, larger and equal range.
+        # 3. Testing proper floating-point thresholds requires a separate test-case
+        "INT23",
+        "UINT23",
+        "INT24",
+        "UINT24",
+        "INT25",
+        "UINT25",
+        "INT26",
+        "UINT26",
+    ],
+)
+@pytest.mark.parametrize(
+    "o_dtype",
+    [
+        # Explanation for selecting these test configurations:
+        # 1. Outputs of MultiThreshold are typically much smaller bit-width than the
+        #    inputs and thresholds.
+        # 2. However, with randomly sampled thresholds from a rather large range due
+        #    to the selected input bit-widths (see above), we risk not adequately
+        #    covering the input range if we sample too few thresholds. The number of
+        #    thresholds sampled depends on the bit-width of the output, thus we use
+        #    rather high bit-width for testing.
+        # 3. For a "real" model, the quantization procedure *should* take care of
+        #    adequately covering the true input range.
+        "INT8",
+        "UINT8",
+    ],
+)
+@pytest.mark.parametrize(
+    "n_elems",
+    [
+        # Explanation for selecting these test configurations:
+        # 1. Small edge cases and quickly running through tests: 1, 2, 3, 4
+        # 2. Large test case 256, hopefully amplifying any rarely occurring errors
+        1,
+        2,
+        3,
+        4,
+        256,
+    ],
+)
 def test_round_and_clip_thresholds_ints(i_dtype, o_dtype, n_elems):
-    # Convert string representation of data type to onnx DataType
     i_dtype = DataType[i_dtype]
     t_dtype = DataType["INT25"]  # Note: Matches configuration above
     o_dtype = DataType[o_dtype]  # noqa: Duplicate model setup code
-    # Create a dummy MultiThreshold operation to be tested
     node = helper.make_node(
-        # Op-Type of the node
         "MultiThreshold",
-        # MultiThreshold is implemented under the qonnx domain
         domain="qonnx.custom_op.general",
-        # List the names of the input tensors
         inputs=["inp", "thresholds"],
-        # List the names of the output tensors
         outputs=["out"],
-        # The CustomOp needs to know the data type of the output to be produced
-        out_dtype=str(o_dtype)
+        out_dtype=str(o_dtype),
     )
-    # Number of threshold values required to produce outputs of type o_dtype
     n_thresholds = o_dtype.get_num_possible_values() - 1
-    # Create tensor value infos for all input/output tensors involved
     inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, [1, n_elems])
     out = helper.make_tensor_value_info("out", TensorProto.FLOAT, [1, n_elems])
-    # Create a tensor value info for the thresholds parameter tensor
-    # Note: Number of thresholds is determined by the output data type
     thresholds = helper.make_tensor_value_info(
         "thresholds", TensorProto.FLOAT, [n_elems, n_thresholds]
     )
-    # Combine node and tensor value infos into an onnx graph
     graph = helper.make_graph([node], "thresholds", [inp, thresholds], [out])
-    # Wrap the model graph in a ModelWrapper container
     model = ModelWrapper(helper.make_model(graph))
-    # Sample random tensors of the configured input data type
+
     inp = gen_finn_dt_tensor(i_dtype, [1, n_elems])
-    # Generate sorted thresholds for each of the input channels
     thresholds = np.sort(gen_finn_dt_tensor(t_dtype, [n_elems, n_thresholds]))
-    # Set data type annotations for the input and thresholds tensor
     model.set_tensor_datatype("inp", i_dtype)  # noqa: Duplicate model execution
     model.set_tensor_datatype("thresholds", t_dtype)
     model.set_tensor_datatype("out", o_dtype)
-    # Set the thresholds as initializer input to the model
     model.set_initializer("thresholds", thresholds)
+
     # Execute the model before running the RoundAndClipThresholds transformation
     out_expected = 
oxe.execute_onnx(model, {"inp": inp})["out"] - # Before rounding the threshold data type must be as annotated assert model.get_tensor_datatype("thresholds") == t_dtype - # Run the transformation to be tested + model = model.transform(RoundAndClipThresholds()) + # After this transformation, the thresholds and output data type should be # inferred correctly assert model.get_tensor_datatype("thresholds") == i_dtype assert model.get_tensor_datatype("out") == o_dtype + # After this transformation, the container type used to store the thresholds # values must be float32. No other type-cast or type promotion may happen. assert model.get_initializer("thresholds").dtype == np.float32 + # After rounding, all thresholds must be integers represented as float32 - assert all( - x.is_integer() for x in model.get_initializer("thresholds").flatten() - ) + assert all(x.is_integer() for x in model.get_initializer("thresholds").flatten()) + # Execute the model after running the RoundAndClipThresholds transformation out_produced = oxe.execute_onnx(model, {"inp": inp})["out"] - # Compare the results before and after: This is the pure integer test-case - # and no actual rounding should happen, thus the rounded operation should - # produce outputs exactly equal. + assert np.all(out_produced == out_expected) # Tests the RoundAndClipThresholds transformation under various input, output # data type combinations with purely integer inputs. This test case tests actual # rounding of floating-point thresholds. -@pytest.mark.parametrize("i_dtype", [ - # Explanation for selecting these test configurations: - # 1. Below 24-bit thresholds we will not observe any interesting rounding - # behavior, as all integers < 2^24 can be exactly represented in 32-bit - # floating-point. Thus, we test thresholds at 25-bit signed integers and - # generate test inputs slightly above and below this. - # 2. We want to test out-of-range clipping of thresholds, in particular - # clipping of the negative portion of signed thresholds. Thus, we only - # generate signed thresholds, but test with signed and unsigned - # inputs of smaller, larger and equal range. - # 3. Testing proper floating-point thresholds requires a separate test-case - "INT23", "UINT23", "INT24", "UINT24", "INT25", "UINT25", "INT26", "UINT26" -]) -@pytest.mark.parametrize("o_dtype", [ - # Explanation for selecting these test configurations: - # 1. Outputs of MultiThreshold are typically much smaller bit-width than the - # inputs and thresholds. - # 2. However, with randomly samples thresholds from a rather large range due - # to the selected input bit-widths (see above), we risk not adequately - # covering the input range if we sample too few thresholds. The number of - # thresholds sampled depends on the bit-width of the output, thus we use - # rather high bit-width for testing. - # 3. For a "real" model, the quantization procedure *should* take care of - # adequately covering the true input range. - "INT8", "UINT8" -]) -@pytest.mark.parametrize("n_elems", [ - # Explanation for selecting these test configurations: - # 1. Small edge cases and quickly running through tests: 1, 2, 3, 4 - # 2. Large test case 256, hopefully amplifying any rarely occurring errors - 1, 2, 3, 4, 256 -]) +@pytest.mark.parametrize( + "i_dtype", + [ + # Explanation for selecting these test configurations: + # 1. Below 24-bit thresholds we will not observe any interesting rounding + # behavior, as all integers < 2^24 can be exactly represented in 32-bit + # floating-point. 
Thus, we test thresholds at 25-bit signed integers and
+        #    generate test inputs slightly above and below this.
+        # 2. We want to test out-of-range clipping of thresholds, in particular
+        #    clipping of the negative portion of signed thresholds. Thus, we only
+        #    generate signed thresholds, but test with signed and unsigned
+        #    inputs of smaller, larger and equal range.
+        # 3. Testing proper floating-point thresholds requires a separate test-case
+        "INT23",
+        "UINT23",
+        "INT24",
+        "UINT24",
+        "INT25",
+        "UINT25",
+        "INT26",
+        "UINT26",
+    ],
+)
+@pytest.mark.parametrize(
+    "o_dtype",
+    [
+        # Explanation for selecting these test configurations:
+        # 1. Outputs of MultiThreshold are typically much smaller bit-width than the
+        #    inputs and thresholds.
+        # 2. However, with randomly sampled thresholds from a rather large range due
+        #    to the selected input bit-widths (see above), we risk not adequately
+        #    covering the input range if we sample too few thresholds. The number of
+        #    thresholds sampled depends on the bit-width of the output, thus we use
+        #    rather high bit-width for testing.
+        # 3. For a "real" model, the quantization procedure *should* take care of
+        #    adequately covering the true input range.
+        "INT8",
+        "UINT8",
+    ],
+)
+@pytest.mark.parametrize(
+    "n_elems",
+    [
+        # Explanation for selecting these test configurations:
+        # 1. Small edge cases and quickly running through tests: 1, 2, 3, 4
+        # 2. Large test case 256, hopefully amplifying any rarely occurring errors
+        1,
+        2,
+        3,
+        4,
+        256,
+    ],
+)
 def test_round_and_clip_thresholds_floats(i_dtype, o_dtype, n_elems):
-    # Convert string representation of data type to onnx DataType
     i_dtype = DataType[i_dtype]
     t_dtype = DataType["FLOAT32"]
     o_dtype = DataType[o_dtype]  # noqa: Duplicate model setup code
-    # Create a dummy MultiThreshold operation to be tested
     node = helper.make_node(
-        # Op-Type of the node
         "MultiThreshold",
-        # MultiThreshold is implemented under the qonnx domain
         domain="qonnx.custom_op.general",
-        # List the names of the input tensors
         inputs=["inp", "thresholds"],
-        # List the names of the output tensors
         outputs=["out"],
-        # The CustomOp needs to know the data type of the output to be produced
-        out_dtype=str(o_dtype)
+        out_dtype=str(o_dtype),
     )
-    # Number of threshold values required to produce outputs of type o_dtype
     n_thresholds = o_dtype.get_num_possible_values() - 1
-    # Create tensor value infos for all input/output tensors involved
     inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, [1, n_elems])
     out = helper.make_tensor_value_info("out", TensorProto.FLOAT, [1, n_elems])
-    # Create a tensor value info for the thresholds parameter tensor
-    # Note: Number of thresholds is determined by the output data type
     thresholds = helper.make_tensor_value_info(
         "thresholds", TensorProto.FLOAT, [n_elems, n_thresholds]
     )
-    # Combine node and tensor value infos into an onnx graph
     graph = helper.make_graph([node], "thresholds", [inp, thresholds], [out])
-    # Wrap the model graph in a ModelWrapper container
     model = ModelWrapper(helper.make_model(graph))
-    # Sample random tensors of the configured input data type
+
     inp = gen_finn_dt_tensor(i_dtype, [1, n_elems])
     # Draw uniformly random prototype thresholds in [0,+1] range
     thresholds = np.random.rand(n_elems, n_thresholds)
@@ -238,30 +235,24 @@ def test_round_and_clip_thresholds_floats(i_dtype, o_dtype, n_elems):
     model.set_tensor_datatype("inp", i_dtype)  # noqa: Duplicate model execution
     model.set_tensor_datatype("thresholds", t_dtype)
     model.set_tensor_datatype("out", 
o_dtype) - # Set the thresholds as initializer input to the model model.set_initializer("thresholds", thresholds) + # Execute the model before running the RoundAndClipThresholds transformation out_expected = oxe.execute_onnx(model, {"inp": inp})["out"] # Before rounding the threshold data type must be as annotated assert model.get_tensor_datatype("thresholds") == t_dtype - # Run the transformation to be tested + model = model.transform(RoundAndClipThresholds()) - # After this transformation, the thresholds and output data type should be - # inferred correctly + assert model.get_tensor_datatype("thresholds") == i_dtype assert model.get_tensor_datatype("out") == o_dtype + # After this transformation, the container type used to store the thresholds # values must be float32. No other type-cast or type promotion may happen. assert model.get_initializer("thresholds").dtype == np.float32 # After rounding, all thresholds must be integers represented as float32 - assert all( - x.is_integer() for x in model.get_initializer("thresholds").flatten() - ) - # Execute the model after running the RoundAndClipThresholds transformation + assert all(x.is_integer() for x in model.get_initializer("thresholds").flatten()) + out_produced = oxe.execute_onnx(model, {"inp": inp})["out"] - # Compare the results before and after: This is the floating-point test with - # actual rounding, this the transformed result may only be equal within some - # tolerance. - # Hm, never observed this to be relevant. For all test configurations, exact - # equality seems to hold, probably due to only integer inputs being tested. + assert np.allclose(out_produced, out_expected, atol=1.0e-3) From 717bfc13e2361e767c220a3d298245f04cfd84ef Mon Sep 17 00:00:00 2001 From: auphelia Date: Thu, 26 Sep 2024 12:57:06 +0100 Subject: [PATCH 30/51] [RoundThresh] Expand rounding of thresholds to hw layers --- src/finn/builder/build_dataflow_steps.py | 2 ++ src/finn/transformation/streamline/round_thresholds.py | 2 +- tests/end2end/test_end2end_bnn_pynq.py | 2 ++ 3 files changed, 5 insertions(+), 1 deletion(-) diff --git a/src/finn/builder/build_dataflow_steps.py b/src/finn/builder/build_dataflow_steps.py index bdbcc53d83..ab2280554c 100644 --- a/src/finn/builder/build_dataflow_steps.py +++ b/src/finn/builder/build_dataflow_steps.py @@ -121,6 +121,7 @@ ) from finn.transformation.streamline import Streamline from finn.transformation.streamline.reorder import MakeMaxPoolNHWC +from finn.transformation.streamline.round_thresholds import RoundAndClipThresholds from finn.util.basic import ( get_rtlsim_trace_depth, pyverilate_get_liveness_threshold_cycles, @@ -503,6 +504,7 @@ def step_minimize_bit_width(model: ModelWrapper, cfg: DataflowBuildConfig): if cfg.minimize_bit_width: model = model.transform(MinimizeWeightBitWidth()) model = model.transform(MinimizeAccumulatorWidth()) + model = model.transform(RoundAndClipThresholds()) # make sure the changed datatypes are propagated through the network model = model.transform(InferDataTypes()) return model diff --git a/src/finn/transformation/streamline/round_thresholds.py b/src/finn/transformation/streamline/round_thresholds.py index ab986e7826..907f127896 100644 --- a/src/finn/transformation/streamline/round_thresholds.py +++ b/src/finn/transformation/streamline/round_thresholds.py @@ -46,7 +46,7 @@ def apply(self, model: ModelWrapper): # noqa graph_modified = False for index, node in enumerate(graph.node): op_type = node.op_type - if op_type == "MultiThreshold": + if op_type == "MultiThreshold" or 
op_type.startswith("Thresholding"): thresholds = model.get_initializer(node.input[1]) if thresholds is None: continue diff --git a/tests/end2end/test_end2end_bnn_pynq.py b/tests/end2end/test_end2end_bnn_pynq.py index 81c6316ec1..0d3418624a 100644 --- a/tests/end2end/test_end2end_bnn_pynq.py +++ b/tests/end2end/test_end2end_bnn_pynq.py @@ -94,6 +94,7 @@ MakeMaxPoolNHWC, MoveScalarLinearPastInvariants, ) +from finn.transformation.streamline.round_thresholds import RoundAndClipThresholds from finn.util.basic import get_finn_root, make_build_dir, test_board_map from finn.util.pytorch import ToTensor from finn.util.test import ( @@ -672,6 +673,7 @@ def test_minimize_bit_width(self, topology, wbits, abits, board): model = load_test_checkpoint_or_skip(prev_chkpt_name) model = model.transform(MinimizeAccumulatorWidth()) model = model.transform(MinimizeWeightBitWidth()) + model = model.transform(RoundAndClipThresholds()) curr_chkpt_name = get_checkpoint_name(topology, wbits, abits, "minimize_bit_width") model.save(curr_chkpt_name) From 6ade140e684167100cce408454efbd9c2b4008c3 Mon Sep 17 00:00:00 2001 From: auphelia Date: Thu, 26 Sep 2024 14:20:04 +0100 Subject: [PATCH 31/51] [RoundThresh] Add change of the weight datatype to hw op threshold rounding --- src/finn/transformation/streamline/round_thresholds.py | 5 +++++ tests/end2end/test_end2end_mobilenet_v1.py | 1 + 2 files changed, 6 insertions(+) diff --git a/src/finn/transformation/streamline/round_thresholds.py b/src/finn/transformation/streamline/round_thresholds.py index 907f127896..ee6a31e3dc 100644 --- a/src/finn/transformation/streamline/round_thresholds.py +++ b/src/finn/transformation/streamline/round_thresholds.py @@ -29,6 +29,7 @@ import numpy as np from qonnx.core.modelwrapper import ModelWrapper +from qonnx.custom_op.registry import getCustomOp from qonnx.transformation.base import Transformation from qonnx.transformation.infer_datatypes import InferDataTypes @@ -69,6 +70,10 @@ def apply(self, model: ModelWrapper): # noqa # The rounded and clipped thresholds now fit into the input data # type model.set_tensor_datatype(node.input[1], dtype) + # If hw op we need to set the weight data type attribute as well + if op_type.startswith("Thresholding"): + inst = getCustomOp(node) + inst.set_nodeattr("weightDataType", dtype.name) # ones if np.any(new_thresholds != thresholds): # Track the graph has been modified to inform the transform diff --git a/tests/end2end/test_end2end_mobilenet_v1.py b/tests/end2end/test_end2end_mobilenet_v1.py index 01d995c147..4c52277970 100644 --- a/tests/end2end/test_end2end_mobilenet_v1.py +++ b/tests/end2end/test_end2end_mobilenet_v1.py @@ -353,6 +353,7 @@ def test_end2end_mobilenet_minimize_bit_width(): model = load_test_checkpoint_or_skip(build_dir + "/end2end_mobilenet_folded.onnx") model = model.transform(MinimizeAccumulatorWidth()) model = model.transform(MinimizeWeightBitWidth()) + model = model.transform(RoundAndClipThresholds()) model.save(build_dir + "/end2end_mobilenet_minimize_bitwidth.onnx") From db353f4fda97df13c593c0a6733e1e3aee9c3ecc Mon Sep 17 00:00:00 2001 From: auphelia Date: Fri, 27 Sep 2024 15:36:10 +0100 Subject: [PATCH 32/51] [RoundThresh] Allow for range + 1 --- .../streamline/round_thresholds.py | 17 ++++++++++++----- .../test_fpgadataflow_thresholding.py | 11 +++++++---- .../streamline/test_round_thresholds.py | 16 ++++++++++++++-- 3 files changed, 33 insertions(+), 11 deletions(-) diff --git a/src/finn/transformation/streamline/round_thresholds.py 
b/src/finn/transformation/streamline/round_thresholds.py index ee6a31e3dc..312db404ac 100644 --- a/src/finn/transformation/streamline/round_thresholds.py +++ b/src/finn/transformation/streamline/round_thresholds.py @@ -28,6 +28,7 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import numpy as np +from qonnx.core.datatype import DataType from qonnx.core.modelwrapper import ModelWrapper from qonnx.custom_op.registry import getCustomOp from qonnx.transformation.base import Transformation @@ -62,18 +63,24 @@ def apply(self, model: ModelWrapper): # noqa # introduce extra inaccuracies due to large integers not being # exactly representable in floating-point representation. # See for example: np.ceil(np.float32(16777217)) == 16777216 - new_thresholds = np.clip(np.ceil(thresholds), dtype.min(), dtype.max()) + new_thresholds = np.clip(np.ceil(thresholds), dtype.min(), dtype.max() + 1) # Convert back to the preferred float32 container type new_thresholds = new_thresholds.astype(np.float32) # Insert the rounded and clipped thresholds back into the model model.set_initializer(node.input[1], new_thresholds) - # The rounded and clipped thresholds now fit into the input data - # type - model.set_tensor_datatype(node.input[1], dtype) + # The rounded and clipped thresholds now fit into a data type + # that is one bit bigger than the input datatype + # Determine new max_value + max_val = dtype.max() + 1 + if not dtype.signed(): + tdt = DataType.get_smallest_possible(max_val) + else: + tdt = DataType.get_smallest_possible(-(max_val) - 1) + model.set_tensor_datatype(node.input[1], tdt) # If hw op we need to set the weight data type attribute as well if op_type.startswith("Thresholding"): inst = getCustomOp(node) - inst.set_nodeattr("weightDataType", dtype.name) + inst.set_nodeattr("weightDataType", tdt.name) # ones if np.any(new_thresholds != thresholds): # Track the graph has been modified to inform the transform diff --git a/tests/fpgadataflow/test_fpgadataflow_thresholding.py b/tests/fpgadataflow/test_fpgadataflow_thresholding.py index fe7ba3d9fb..2079fe7fc5 100644 --- a/tests/fpgadataflow/test_fpgadataflow_thresholding.py +++ b/tests/fpgadataflow/test_fpgadataflow_thresholding.py @@ -49,6 +49,7 @@ from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers +from finn.transformation.streamline.round_thresholds import RoundAndClipThresholds test_fpga_part = "xczu3eg-sbva484-1-e" target_clk_ns = 5 @@ -133,10 +134,8 @@ def make_single_multithresholding_modelwrapper( @pytest.mark.parametrize( "idt_tdt_cfg", [ - (DataType["INT8"], DataType["INT8"]), - (DataType["INT8"], DataType["INT9"]), - (DataType["UINT5"], DataType["UINT5"]), - (DataType["UINT5"], DataType["UINT6"]), + (DataType["INT8"], DataType["INT25"]), + (DataType["UINT5"], DataType["UINT8"]), ], ) @pytest.mark.parametrize("fold", [-1, 1, 2]) @@ -145,6 +144,7 @@ def make_single_multithresholding_modelwrapper( @pytest.mark.parametrize("impl_style", ["hls", "rtl"]) @pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"]) @pytest.mark.parametrize("mem_mode", ["internal_embedded", "internal_decoupled"]) +@pytest.mark.parametrize("round_thresh", [True, False]) @pytest.mark.fpgadataflow @pytest.mark.vivado @pytest.mark.slow @@ -159,6 +159,7 @@ def test_fpgadataflow_thresholding( impl_style, exec_mode, mem_mode, + round_thresh, ): # the mem_mode parameter can only be 
used for the hls thresholding # so the test will only be executed once for impl_style=rtl and once skipped @@ -234,6 +235,8 @@ def test_fpgadataflow_thresholding( node = model.get_nodes_by_op_type(model.graph.node[0].op_type)[0] inst = getCustomOp(node) inst.set_nodeattr("PE", pe) + if round_thresh is True: + model = model.transform(RoundAndClipThresholds()) model = model.transform(GiveUniqueNodeNames()) if impl_style == "hls": diff --git a/tests/transformation/streamline/test_round_thresholds.py b/tests/transformation/streamline/test_round_thresholds.py index 7e2d39176e..6de82e6750 100644 --- a/tests/transformation/streamline/test_round_thresholds.py +++ b/tests/transformation/streamline/test_round_thresholds.py @@ -96,6 +96,7 @@ 256, ], ) +@pytest.mark.streamline def test_round_and_clip_thresholds_ints(i_dtype, o_dtype, n_elems): i_dtype = DataType[i_dtype] t_dtype = DataType["INT25"] # Note: Matches configuration above @@ -106,6 +107,7 @@ def test_round_and_clip_thresholds_ints(i_dtype, o_dtype, n_elems): inputs=["inp", "thresholds"], outputs=["out"], out_dtype=str(o_dtype), + out_bias=float(o_dtype.min()), ) n_thresholds = o_dtype.get_num_possible_values() - 1 inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, [1, n_elems]) @@ -117,6 +119,7 @@ def test_round_and_clip_thresholds_ints(i_dtype, o_dtype, n_elems): model = ModelWrapper(helper.make_model(graph)) inp = gen_finn_dt_tensor(i_dtype, [1, n_elems]) + inp[0][0] = i_dtype.max() thresholds = np.sort(gen_finn_dt_tensor(t_dtype, [n_elems, n_thresholds])) model.set_tensor_datatype("inp", i_dtype) # noqa: Duplicate model execution model.set_tensor_datatype("thresholds", t_dtype) @@ -131,7 +134,11 @@ def test_round_and_clip_thresholds_ints(i_dtype, o_dtype, n_elems): # After this transformation, the thresholds and output data type should be # inferred correctly - assert model.get_tensor_datatype("thresholds") == i_dtype + if not i_dtype.signed(): + new_tdt = DataType.get_smallest_possible(i_dtype.max() + 1) + else: + new_tdt = DataType.get_smallest_possible(-(i_dtype.max() + 1) - 1) + assert model.get_tensor_datatype("thresholds") == new_tdt assert model.get_tensor_datatype("out") == o_dtype # After this transformation, the container type used to store the thresholds @@ -203,6 +210,7 @@ def test_round_and_clip_thresholds_ints(i_dtype, o_dtype, n_elems): 256, ], ) +@pytest.mark.streamline def test_round_and_clip_thresholds_floats(i_dtype, o_dtype, n_elems): i_dtype = DataType[i_dtype] t_dtype = DataType["FLOAT32"] @@ -244,7 +252,11 @@ def test_round_and_clip_thresholds_floats(i_dtype, o_dtype, n_elems): model = model.transform(RoundAndClipThresholds()) - assert model.get_tensor_datatype("thresholds") == i_dtype + if not i_dtype.signed(): + new_tdt = DataType.get_smallest_possible(i_dtype.max() + 1) + else: + new_tdt = DataType.get_smallest_possible(-(i_dtype.max() + 1) - 1) + assert model.get_tensor_datatype("thresholds") == new_tdt assert model.get_tensor_datatype("out") == o_dtype # After this transformation, the container type used to store the thresholds From b250047d444dfdc129bd667ce790c9c7982f2b39 Mon Sep 17 00:00:00 2001 From: auphelia Date: Fri, 11 Oct 2024 09:47:01 +0100 Subject: [PATCH 33/51] [tutorial] Update folding config to new custom operator structure --- tutorials/fpga_flow/folding_config.json | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/tutorials/fpga_flow/folding_config.json b/tutorials/fpga_flow/folding_config.json index 642200d02b..bf94f8058d 100644 --- 
a/tutorials/fpga_flow/folding_config.json
+++ b/tutorials/fpga_flow/folding_config.json
@@ -1,30 +1,29 @@
 {
   "Defaults": {},
-  "Thresholding_Batch_0": {
-    "PE": 49,
-    "ram_style": "block"
+  "Thresholding_rtl_0": {
+    "PE": 49
   },
-  "MatrixVectorActivation_0": {
+  "MVAU_hls_0": {
     "PE": 16,
     "SIMD": 49,
     "ram_style": "block"
   },
-  "MatrixVectorActivation_1": {
+  "MVAU_hls_1": {
     "PE": 8,
     "SIMD": 8,
     "ram_style": "auto"
   },
-  "MatrixVectorActivation_2": {
+  "MVAU_hls_2": {
     "PE": 8,
     "SIMD": 8,
     "ram_style": "auto"
   },
-  "MatrixVectorActivation_3": {
+  "MVAU_hls_3": {
     "PE": 10,
     "SIMD": 8,
     "ram_style": "distributed"
   },
-  "LabelSelect_Batch_0": {
+  "LabelSelect_hls_0": {
     "PE": 1
   }
 }

From b48147e0a6637659a8a7127dd0016edded998ed5 Mon Sep 17 00:00:00 2001
From: auphelia
Date: Fri, 11 Oct 2024 10:36:20 +0100
Subject: [PATCH 34/51] [tutorial] Format tutorial README

---
 tutorials/fpga_flow/README.md | 44 ++++++++++++++++++++++------------
 1 file changed, 28 insertions(+), 16 deletions(-)

diff --git a/tutorials/fpga_flow/README.md b/tutorials/fpga_flow/README.md
index 2aaad0423b..71f2a2a625 100644
--- a/tutorials/fpga_flow/README.md
+++ b/tutorials/fpga_flow/README.md
@@ -25,20 +25,29 @@ This demo was created using Vivado 2022.1.
 Prior to running, ensure the following prerequisites have been met:
 - Install FINN and prerequisites.  The [Getting Started](https://finn.readthedocs.io/en/latest/getting_started.html#quickstart) section of the FINN documentation might be helpful for this.
 - Ensure you have the `FINN_XILINX_PATH` and `FINN_XILINX_VERSION` env variables set appropriately for your install.  For example:
-> export FINN_XILINX_PATH=/opt/Xilinx
-> export FINN_XILINX_VERSION=2022.1
+```shell
+export FINN_XILINX_PATH=/opt/Xilinx
+export FINN_XILINX_VERSION=2022.1
+```
+
 - Set the env variable for your `finn` install top directory (where you cloned the FINN compiler repo):
-> export FINN_ROOT=/home/foo/finn
+```shell
+export FINN_ROOT=/home/foo/finn
+```
 
 Then, change to `finn` install directory and invoke the build as follows:
-> cd ${FINN_ROOT}
-> ./run-docker.sh build_custom ${FINN_ROOT}/tutorials/fpga_flow/
+```shell
+cd ${FINN_ROOT}
+./run-docker.sh build_custom ${FINN_ROOT}/tutorials/fpga_flow/
+```
 
 Alternatively, since the tutorials folder is already part of the FINN compiler installation, you can invoke it from within the Docker container:
-> cd ${FINN_ROOT}
-> ./run-docker.sh
-> cd tutorials/fpga_flow
-> python build.py
+```shell
+cd ${FINN_ROOT}
+./run-docker.sh
+cd tutorials/fpga_flow
+python build.py
+```
 
 The build should finish in about 10 minutes, and the FINN docker will close on success.
 
@@ -59,12 +68,14 @@ The build should finish in about 10 minutes, and the FINN docker will close on s
 ### Examine the Stitched IP
 
 Navigate to the stitched IP project directory:
-
-> cd ${FINN_ROOT}/tutorials/fpga_flow/output_tfc_w0a1_fpga/stitched_ip
+```shell
+cd ${FINN_ROOT}/tutorials/fpga_flow/output_tfc_w0a1_fpga/stitched_ip
+```
 
 And, open the project:
-
-> vivado finn_vivado_stitch_proj.xpr
+```shell
+vivado finn_vivado_stitch_proj.xpr
+```
 
 Explore the IPI board design and note the interfaces.
 
@@ -89,9 +100,10 @@ them under `${FINN_ROOT}/tutorials/fpga_flow/output_tfc_w0a1_fpga/sim`.  Let's ex
 the FINN compiler.  Used for launching the testbench simulation.
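If you prefer to run headless, the same generated script can be sourced in Vivado batch mode instead of the GUI. This is a minimal sketch, assuming the default script and output directory names used throughout this tutorial:

```shell
cd ${FINN_ROOT}/tutorials/fpga_flow/output_tfc_w0a1_fpga/sim
vivado -mode batch -source make_sim_proj.tcl
```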
You can now launch the simulation as follows: - -> cd ${FINN_ROOT}/tutorials/fpga_flow/output_tfc_w0a1_fpga/sim -> vivado -mode gui -source make_sim_proj.tcl +```shell +cd ${FINN_ROOT}/tutorials/fpga_flow/output_tfc_w0a1_fpga/sim +vivado -mode gui -source make_sim_proj.tcl +``` The simulation should complete with: From f6acf7075b3af97719edd3705f1268f0d357e0fa Mon Sep 17 00:00:00 2001 From: Alexander Hornburg Date: Wed, 23 Oct 2024 17:42:26 +0100 Subject: [PATCH 35/51] [Infra] support passing arguments to build_custom flow --- run-docker.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/run-docker.sh b/run-docker.sh index b1fe44eb0c..1358337a37 100755 --- a/run-docker.sh +++ b/run-docker.sh @@ -142,7 +142,7 @@ elif [ "$1" = "build_custom" ]; then DOCKER_INTERACTIVE="-it" #FINN_HOST_BUILD_DIR=$BUILD_DATAFLOW_DIR/build gecho "Running build_custom: $BUILD_CUSTOM_DIR/$FLOW_NAME.py" - DOCKER_CMD="python -mpdb -cc -cq $FLOW_NAME.py" + DOCKER_CMD="python -mpdb -cc -cq $FLOW_NAME.py ${@:4}" elif [ -z "$1" ]; then gecho "Running container only" DOCKER_CMD="bash" From 1d7636b8f8d841eda4e20b6cbd365b4a7257f24d Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 23 Oct 2024 17:41:32 +0000 Subject: [PATCH 36/51] Bump onnx from 1.13.0 to 1.17.0 Bumps [onnx](https://github.com/onnx/onnx) from 1.13.0 to 1.17.0. - [Release notes](https://github.com/onnx/onnx/releases) - [Changelog](https://github.com/onnx/onnx/blob/main/docs/Changelog-ml.md) - [Commits](https://github.com/onnx/onnx/compare/v1.13.0...v1.17.0) --- updated-dependencies: - dependency-name: onnx dependency-type: direct:production ... Signed-off-by: dependabot[bot] --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index d4ca45cb37..85a0ca1175 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,7 +5,7 @@ gspread==3.6.0 importlib-resources==6.1.0 ipython==8.12.2 numpy==1.24.1 -onnx==1.13.0 +onnx==1.17.0 onnxoptimizer onnxruntime==1.16.1 pre-commit==3.3.2 From 14b68b7efa235089bf7e1d8d40416095bcb23e81 Mon Sep 17 00:00:00 2001 From: auphelia Date: Thu, 24 Oct 2024 14:29:36 +0100 Subject: [PATCH 37/51] [Infra] Add no-cache env var for run docker script --- run-docker.sh | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/run-docker.sh b/run-docker.sh index 1358337a37..8bf6440d4f 100755 --- a/run-docker.sh +++ b/run-docker.sh @@ -102,6 +102,7 @@ SCRIPTPATH=$(dirname "$SCRIPT") : ${FINN_SINGULARITY=""} : ${FINN_SKIP_XRT_DOWNLOAD=""} : ${FINN_XRT_PATH=""} +: ${FINN_DOCKER_NO_CACHE="0"} DOCKER_INTERACTIVE="" @@ -190,12 +191,18 @@ if [ -d "$FINN_XRT_PATH" ];then export LOCAL_XRT=1 fi +if [ "$FINN_DOCKER_NO_CACHE" = "1" ]; then + export NO_CACHE_STRING="--no-cache" +else + export NO_CACHE_STRING="" +fi + # Build the FINN Docker image if [ "$FINN_DOCKER_PREBUILT" = "0" ] && [ -z "$FINN_SINGULARITY" ]; then # Need to ensure this is done within the finn/ root folder: OLD_PWD=$(pwd) cd $SCRIPTPATH - docker build -f docker/Dockerfile.finn --build-arg XRT_DEB_VERSION=$XRT_DEB_VERSION --build-arg SKIP_XRT=$FINN_SKIP_XRT_DOWNLOAD --build-arg LOCAL_XRT=$LOCAL_XRT --tag=$FINN_DOCKER_TAG $FINN_DOCKER_BUILD_EXTRA . + docker build -f docker/Dockerfile.finn --build-arg XRT_DEB_VERSION=$XRT_DEB_VERSION --build-arg SKIP_XRT=$FINN_SKIP_XRT_DOWNLOAD --build-arg LOCAL_XRT=$LOCAL_XRT --tag=$FINN_DOCKER_TAG $FINN_DOCKER_BUILD_EXTRA $NO_CACHE_STRING . 
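# Usage sketch (an illustrative assumption, not taken from the patch itself):
# once FINN_DOCKER_NO_CACHE is set, a clean image rebuild can be forced via
#   FINN_DOCKER_NO_CACHE=1 ./run-docker.sh build_custom ${FINN_ROOT}/tutorials/fpga_flow/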
cd $OLD_PWD fi From 72dcb87f510436d60ad0c370e6b90692ebf5b213 Mon Sep 17 00:00:00 2001 From: auphelia Date: Thu, 24 Oct 2024 14:41:37 +0100 Subject: [PATCH 38/51] [Infra] Re-use build extra env vars to enable no cache option --- run-docker.sh | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/run-docker.sh b/run-docker.sh index 8bf6440d4f..69c998c467 100755 --- a/run-docker.sh +++ b/run-docker.sh @@ -192,9 +192,7 @@ if [ -d "$FINN_XRT_PATH" ];then fi if [ "$FINN_DOCKER_NO_CACHE" = "1" ]; then - export NO_CACHE_STRING="--no-cache" -else - export NO_CACHE_STRING="" + FINN_DOCKER_BUILD_EXTRA+="--no-cache" fi # Build the FINN Docker image @@ -202,7 +200,7 @@ if [ "$FINN_DOCKER_PREBUILT" = "0" ] && [ -z "$FINN_SINGULARITY" ]; then # Need to ensure this is done within the finn/ root folder: OLD_PWD=$(pwd) cd $SCRIPTPATH - docker build -f docker/Dockerfile.finn --build-arg XRT_DEB_VERSION=$XRT_DEB_VERSION --build-arg SKIP_XRT=$FINN_SKIP_XRT_DOWNLOAD --build-arg LOCAL_XRT=$LOCAL_XRT --tag=$FINN_DOCKER_TAG $FINN_DOCKER_BUILD_EXTRA $NO_CACHE_STRING . + docker build -f docker/Dockerfile.finn --build-arg XRT_DEB_VERSION=$XRT_DEB_VERSION --build-arg SKIP_XRT=$FINN_SKIP_XRT_DOWNLOAD --build-arg LOCAL_XRT=$LOCAL_XRT --tag=$FINN_DOCKER_TAG $FINN_DOCKER_BUILD_EXTRA . cd $OLD_PWD fi From f0aafa261e7a8f57891ba12cd1572e7d3062bc19 Mon Sep 17 00:00:00 2001 From: auphelia Date: Thu, 24 Oct 2024 15:19:55 +0100 Subject: [PATCH 39/51] [Infra] Add space to no cache var to allow for future extension --- run-docker.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/run-docker.sh b/run-docker.sh index 69c998c467..b59af88eb7 100755 --- a/run-docker.sh +++ b/run-docker.sh @@ -192,7 +192,7 @@ if [ -d "$FINN_XRT_PATH" ];then fi if [ "$FINN_DOCKER_NO_CACHE" = "1" ]; then - FINN_DOCKER_BUILD_EXTRA+="--no-cache" + FINN_DOCKER_BUILD_EXTRA+="--no-cache " fi # Build the FINN Docker image From a9f1898deccb74a4f8e38717c5bef00e46c9f70f Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Fri, 1 Nov 2024 11:35:04 +0000 Subject: [PATCH 40/51] Use Vivado tclstore from install instead of home --- run-docker.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/run-docker.sh b/run-docker.sh index b59af88eb7..ec55299f6c 100755 --- a/run-docker.sh +++ b/run-docker.sh @@ -231,6 +231,9 @@ DOCKER_EXEC+="-e NUM_DEFAULT_WORKERS=$NUM_DEFAULT_WORKERS " # Workaround for FlexLM issue, see: # https://community.flexera.com/t5/InstallAnywhere-Forum/Issues-when-running-Xilinx-tools-or-Other-vendor-tools-in-docker/m-p/245820#M10647 DOCKER_EXEC+="-e LD_PRELOAD=/lib/x86_64-linux-gnu/libudev.so.1 " +# Workaround for running multiple Vivado instances simultaneously, see: +# https://adaptivesupport.amd.com/s/article/63253?language=en_US +DOCKER_EXEC+="-e XILINX_LOCAL_USER_DATA=no " if [ "$FINN_DOCKER_RUN_AS_ROOT" = "0" ] && [ -z "$FINN_SINGULARITY" ];then DOCKER_EXEC+="-v /etc/group:/etc/group:ro " DOCKER_EXEC+="-v /etc/passwd:/etc/passwd:ro " From 016c425a44468f419eab97f8e9cde05072a49e26 Mon Sep 17 00:00:00 2001 From: Michal Danilowicz Date: Mon, 16 Sep 2024 16:44:54 +0000 Subject: [PATCH 41/51] [Feature] Timeout template added --- src/finn/custom_op/fpgadataflow/hlsbackend.py | 14 ++++++ src/finn/custom_op/fpgadataflow/templates.py | 45 +++++++++++++++++++ 2 files changed, 59 insertions(+) diff --git a/src/finn/custom_op/fpgadataflow/hlsbackend.py b/src/finn/custom_op/fpgadataflow/hlsbackend.py index d8210fd684..c03a9029db 100644 --- a/src/finn/custom_op/fpgadataflow/hlsbackend.py +++ 
b/src/finn/custom_op/fpgadataflow/hlsbackend.py @@ -474,3 +474,17 @@ def get_ap_int_max_w(self): ret = max([instream, outstream]) assert ret <= 8191, "AP_INT_MAX_W=%d is larger than allowed maximum of 8191" % ret return ret + + def timeout_value(self): + """Set timeout value for HLS functions defined for one clock cycle""" + self.code_gen_dict["$TIMEOUT_VALUE$"] = ["100"] + + def timeout_condition(self): + """Set timeout condition for HLS functions defined for one clock cycle""" + self.code_gen_dict["$TIMEOUT_CONDITION$"] = ["out_{}.empty()".format(self.hls_sname())] + + def timeout_read_stream(self): + """Set reading output stream procedure for HLS functions defined for one clock cycle""" + self.code_gen_dict["$TIMEOUT_READ_STREAM$"] = [ + "debug_out_{} << out_{}.read();".format(self.hls_sname(), self.hls_sname()) + ] diff --git a/src/finn/custom_op/fpgadataflow/templates.py b/src/finn/custom_op/fpgadataflow/templates.py index 3d89a0ab23..7ef74118ec 100644 --- a/src/finn/custom_op/fpgadataflow/templates.py +++ b/src/finn/custom_op/fpgadataflow/templates.py @@ -58,6 +58,51 @@ """ +# template for single node execution with timeout (for single clock hls operations) +docompute_template_timeout = """ +#define AP_INT_MAX_W $AP_INT_MAX_W$ +#include "cnpy.h" +#include "npy2apintstream.hpp" +#include "npy2vectorstream.hpp" +#include +#include "bnn-library.h" + +// includes for network parameters +$GLOBALS$ + +// defines for network parameters +$DEFINES$ + +int main(){ +$PRAGMAS$ + +$STREAMDECLARATIONS$ + +$READNPYDATA$ + +unsigned timeout = 0; +while(timeout < $TIMEOUT_VALUE$){ + +$DOCOMPUTE$ + +if($TIMEOUT_CONDITION$){ +timeout++; +} + +else{ +$TIMEOUT_READ_STREAM$ +timeout = 0; +} +} + +$DATAOUTSTREAM$ + +$SAVEASCNPY$ + +} + +""" + # templates for single node ip generation # cpp file From fe69308ef293093239d4c8137f80e71375e9bfaf Mon Sep 17 00:00:00 2001 From: mdaniowi Date: Fri, 20 Sep 2024 16:02:40 +0100 Subject: [PATCH 42/51] [Feature] npy2vectorstream.hpp include added to docompute_template --- src/finn/custom_op/fpgadataflow/templates.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/finn/custom_op/fpgadataflow/templates.py b/src/finn/custom_op/fpgadataflow/templates.py index 7ef74118ec..d2100a7516 100644 --- a/src/finn/custom_op/fpgadataflow/templates.py +++ b/src/finn/custom_op/fpgadataflow/templates.py @@ -32,6 +32,7 @@ #define AP_INT_MAX_W $AP_INT_MAX_W$ #include "cnpy.h" #include "npy2apintstream.hpp" +#include "npy2vectorstream.hpp" #include #include "bnn-library.h" From f21da72ba791ec1a9423f7761b8806843dc417a0 Mon Sep 17 00:00:00 2001 From: auphelia Date: Thu, 5 Dec 2024 14:42:22 +0000 Subject: [PATCH 43/51] First draft of enabling both cpp interfaces in cppsim --- src/finn/custom_op/fpgadataflow/hlsbackend.py | 55 +++++++++++++------ 1 file changed, 39 insertions(+), 16 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/hlsbackend.py b/src/finn/custom_op/fpgadataflow/hlsbackend.py index c03a9029db..98b1dc80c9 100644 --- a/src/finn/custom_op/fpgadataflow/hlsbackend.py +++ b/src/finn/custom_op/fpgadataflow/hlsbackend.py @@ -54,6 +54,8 @@ def get_nodeattr_types(self): "code_gen_dir_cppsim": ("s", False, ""), "executable_path": ("s", False, ""), "res_hls": ("s", False, ""), + # temporary node attribute to keep track of interface style of hls ops + "cpp_interface": ("s", False, "packed", {"packed", "hls_vector"}), } def get_all_verilog_paths(self): @@ -206,7 +208,13 @@ def code_generation_cppsim(self, model): self.dataoutstrm() self.save_as_npy() - template = 
templates.docompute_template + if self.get_nodeattr("cpp_interface") == "hls_vector": + self.timeout_value() + self.timeout_condition() + self.timeout_read_stream() + template = templates.docompute_template_timeout + else: + template = templates.docompute_template for key in self.code_gen_dict: # transform list into long string separated by '\n' @@ -422,27 +430,42 @@ def dataoutstrm(self): if dtype == DataType["BIPOLAR"]: # use binary for bipolar storage dtype = DataType["BINARY"] - elem_bits = dtype.bitwidth() - packed_bits = self.get_outstream_width() - packed_hls_type = "ap_uint<%d>" % packed_bits elem_hls_type = dtype.get_hls_datatype_str() npy_type = "float" npy_out = "%s/output.npy" % code_gen_dir oshape = self.get_folded_output_shape() oshape_cpp_str = str(oshape).replace("(", "{").replace(")", "}") - self.code_gen_dict["$DATAOUTSTREAM$"] = [ - 'apintstream2npy<%s, %s, %d, %s>(out_%s, %s, "%s");' - % ( - packed_hls_type, - elem_hls_type, - elem_bits, - npy_type, - self.hls_sname(), - oshape_cpp_str, - npy_out, - ) - ] + cpp_interface = self.get_nodeattr("cpp_interface") + + if cpp_interface == "packed": + elem_bits = dtype.bitwidth() + packed_bits = self.get_outstream_width() + packed_hls_type = "ap_uint<%d>" % packed_bits + + self.code_gen_dict["$DATAOUTSTREAM$"] = [ + 'apintstream2npy<%s, %s, %d, %s>(out_%s, %s, "%s");' + % ( + packed_hls_type, + elem_hls_type, + elem_bits, + npy_type, + self.hls_sname(), + oshape_cpp_str, + npy_out, + ) + ] + else: + self.code_gen_dict["$DATAOUTSTREAM$"] = [ + 'vectorstream2npy<%s, %s, SIMD>(debug_out_%s, %s, "%s");' + % ( + elem_hls_type, + npy_type, + self.hls_sname(), + oshape_cpp_str, + npy_out, + ) + ] def save_as_npy(self): """Function to generate the commands for saving data in .npy file in c++""" From 0b5e80e2c0d41051f3969d49cda2ffae74470df8 Mon Sep 17 00:00:00 2001 From: auphelia Date: Mon, 9 Dec 2024 14:14:54 +0000 Subject: [PATCH 44/51] [HLSBackend] Update hls vector cppsim methods --- src/finn/custom_op/fpgadataflow/hlsbackend.py | 49 +++++++++++++------ 1 file changed, 33 insertions(+), 16 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/hlsbackend.py b/src/finn/custom_op/fpgadataflow/hlsbackend.py index 98b1dc80c9..d8397c67fd 100644 --- a/src/finn/custom_op/fpgadataflow/hlsbackend.py +++ b/src/finn/custom_op/fpgadataflow/hlsbackend.py @@ -379,24 +379,40 @@ def read_npy_data(self): if dtype == DataType["BIPOLAR"]: # use binary for bipolar storage dtype = DataType["BINARY"] - elem_bits = dtype.bitwidth() - packed_bits = self.get_instream_width() - packed_hls_type = "ap_uint<%d>" % packed_bits elem_hls_type = dtype.get_hls_datatype_str() npy_type = "float" npy_in = "%s/input_0.npy" % code_gen_dir self.code_gen_dict["$READNPYDATA$"] = [] - self.code_gen_dict["$READNPYDATA$"].append( - 'npy2apintstream<%s, %s, %d, %s>("%s", in0_%s);' - % ( - packed_hls_type, - elem_hls_type, - elem_bits, - npy_type, - npy_in, - self.hls_sname(), + + cpp_interface = self.get_nodeattr("cpp_interface") + + if cpp_interface == "packed": + elem_bits = dtype.bitwidth() + packed_bits = self.get_instream_width() + packed_hls_type = "ap_uint<%d>" % packed_bits + self.code_gen_dict["$READNPYDATA$"].append( + 'npy2apintstream<%s, %s, %d, %s>("%s", in0_%s);' + % ( + packed_hls_type, + elem_hls_type, + elem_bits, + npy_type, + npy_in, + self.hls_sname(), + ) + ) + else: + folded_shape = self.get_folded_input_shape() + self.code_gen_dict["$READNPYDATA$"].append( + 'npy2vectorstream<%s, %s, %d>("%s", in0_%s, false);' + % ( + elem_hls_type, + npy_type, + 
folded_shape[-1], + npy_in, + self.hls_sname(), + ) ) - ) def strm_decl(self): """Function to generate the commands for the stream declaration in c++, @@ -456,12 +472,13 @@ def dataoutstrm(self): ) ] else: + folded_shape = self.get_folded_output_shape() self.code_gen_dict["$DATAOUTSTREAM$"] = [ - 'vectorstream2npy<%s, %s, SIMD>(debug_out_%s, %s, "%s");' + 'vectorstream2npy<%s, %s, %d>(strm, %s, "%s");' % ( elem_hls_type, npy_type, - self.hls_sname(), + folded_shape[-1], oshape_cpp_str, npy_out, ) @@ -509,5 +526,5 @@ def timeout_condition(self): def timeout_read_stream(self): """Set reading output stream procedure for HLS functions defined for one clock cycle""" self.code_gen_dict["$TIMEOUT_READ_STREAM$"] = [ - "debug_out_{} << out_{}.read();".format(self.hls_sname(), self.hls_sname()) + "strm << out_{}.read();".format(self.hls_sname()) ] From 95bc8a66eb721e8924a820b2b2994792bdbda457 Mon Sep 17 00:00:00 2001 From: auphelia Date: Mon, 9 Dec 2024 14:36:11 +0000 Subject: [PATCH 45/51] [HLSBackend] Increase time out value --- src/finn/custom_op/fpgadataflow/hlsbackend.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/finn/custom_op/fpgadataflow/hlsbackend.py b/src/finn/custom_op/fpgadataflow/hlsbackend.py index d8397c67fd..4677960ea8 100644 --- a/src/finn/custom_op/fpgadataflow/hlsbackend.py +++ b/src/finn/custom_op/fpgadataflow/hlsbackend.py @@ -517,7 +517,7 @@ def get_ap_int_max_w(self): def timeout_value(self): """Set timeout value for HLS functions defined for one clock cycle""" - self.code_gen_dict["$TIMEOUT_VALUE$"] = ["100"] + self.code_gen_dict["$TIMEOUT_VALUE$"] = ["1000"] def timeout_condition(self): """Set timeout condition for HLS functions defined for one clock cycle""" From 7d0d3a9592169faf092de75e87985e8598d88334 Mon Sep 17 00:00:00 2001 From: Joshua Monson Date: Thu, 12 Dec 2024 00:24:58 +0000 Subject: [PATCH 46/51] switch blocking to non-blocking to blocking assignments in combination logic --- finn-rtllib/fifo/hdl/Q_srl.v | 144 +++++++++++++++++------------------ 1 file changed, 72 insertions(+), 72 deletions(-) diff --git a/finn-rtllib/fifo/hdl/Q_srl.v b/finn-rtllib/fifo/hdl/Q_srl.v index d1ce33c41f..0b01973163 100644 --- a/finn-rtllib/fifo/hdl/Q_srl.v +++ b/finn-rtllib/fifo/hdl/Q_srl.v @@ -184,58 +184,58 @@ module Q_srl (clock, reset, i_d, i_v, i_r, o_d, o_v, o_r, count, maxcount); end // always @ (posedge clock or negedge reset) always @* begin // - combi always - srlo_ <= 'bx; - shift_en_o_ <= 1'bx; - shift_en_ <= 1'bx; - addr_ <= 'bx; - state_ <= 2'bx; + srlo_ = 'bx; + shift_en_o_ = 1'bx; + shift_en_ = 1'bx; + addr_ = 'bx; + state_ = 2'bx; case (state) state_empty: begin // - (empty, will not produce) if (i_v) begin // - empty & i_v => consume - srlo_ <= i_d; - shift_en_o_ <= 1; - shift_en_ <= 1'bx; - addr_ <= 0; - state_ <= state_one; + srlo_ = i_d; + shift_en_o_ = 1; + shift_en_ = 1'bx; + addr_ = 0; + state_ = state_one; end else begin // - empty & !i_v => idle - srlo_ <= 'bx; - shift_en_o_ <= 0; - shift_en_ <= 1'bx; - addr_ <= 0; - state_ <= state_empty; + srlo_ = 'bx; + shift_en_o_ = 0; + shift_en_ = 1'bx; + addr_ = 0; + state_ = state_empty; end end state_one: begin // - (contains one) if (i_v && o_b) begin // - one & i_v & o_b => consume - srlo_ <= 'bx; - shift_en_o_ <= 0; - shift_en_ <= 1; - addr_ <= 0; - state_ <= state_more; + srlo_ = 'bx; + shift_en_o_ = 0; + shift_en_ = 1; + addr_ = 0; + state_ = state_more; end else if (i_v && !o_b) begin // - one & i_v & !o_b => cons+prod - srlo_ <= i_d; - shift_en_o_ <= 1; - shift_en_ <= 1; - addr_ <= 
0; - state_ <= state_one; + srlo_ = i_d; + shift_en_o_ = 1; + shift_en_ = 1; + addr_ = 0; + state_ = state_one; end else if (!i_v && o_b) begin // - one & !i_v & o_b => idle - srlo_ <= 'bx; - shift_en_o_ <= 0; - shift_en_ <= 1'bx; - addr_ <= 0; - state_ <= state_one; + srlo_ = 'bx; + shift_en_o_ = 0; + shift_en_ = 1'bx; + addr_ = 0; + state_ = state_one; end else if (!i_v && !o_b) begin // - one & !i_v & !o_b => produce - srlo_ <= 'bx; - shift_en_o_ <= 0; - shift_en_ <= 1'bx; - addr_ <= 0; - state_ <= state_empty; + srlo_ = 'bx; + shift_en_o_ = 0; + shift_en_ = 1'bx; + addr_ = 0; + state_ = state_empty; end end // case: state_one @@ -244,60 +244,60 @@ module Q_srl (clock, reset, i_d, i_v, i_r, o_d, o_v, o_r, count, maxcount); // - (full, will not consume) // - (full here if depth==2) if (o_b) begin // - full & o_b => idle - srlo_ <= 'bx; - shift_en_o_ <= 0; - shift_en_ <= 0; - addr_ <= addr; - state_ <= state_more; + srlo_ = 'bx; + shift_en_o_ = 0; + shift_en_ = 0; + addr_ = addr; + state_ = state_more; end else begin // - full & !o_b => produce - srlo_ <= srl[addr]; - shift_en_o_ <= 1; - shift_en_ <= 0; -// addr_ <= addr-1; -// state_ <= state_more; - addr_ <= addr_zero_ ? 0 : addr-1; - state_ <= addr_zero_ ? state_one : state_more; + srlo_ = srl[addr]; + shift_en_o_ = 1; + shift_en_ = 0; +// addr_ = addr-1; +// state_ = state_more; + addr_ = addr_zero_ ? 0 : addr-1; + state_ = addr_zero_ ? state_one : state_more; end end else begin // - (mid: neither empty nor full) if (i_v && o_b) begin // - mid & i_v & o_b => consume - srlo_ <= 'bx; - shift_en_o_ <= 0; - shift_en_ <= 1; - addr_ <= addr+1; - state_ <= state_more; + srlo_ = 'bx; + shift_en_o_ = 0; + shift_en_ = 1; + addr_ = addr+1; + state_ = state_more; end else if (i_v && !o_b) begin // - mid & i_v & !o_b => cons+prod - srlo_ <= srl[addr]; - shift_en_o_ <= 1; - shift_en_ <= 1; - addr_ <= addr; - state_ <= state_more; + srlo_ = srl[addr]; + shift_en_o_ = 1; + shift_en_ = 1; + addr_ = addr; + state_ = state_more; end else if (!i_v && o_b) begin // - mid & !i_v & o_b => idle - srlo_ <= 'bx; - shift_en_o_ <= 0; - shift_en_ <= 0; - addr_ <= addr; - state_ <= state_more; + srlo_ = 'bx; + shift_en_o_ = 0; + shift_en_ = 0; + addr_ = addr; + state_ = state_more; end else if (!i_v && !o_b) begin // - mid & !i_v & !o_b => produce - srlo_ <= srl[addr]; - shift_en_o_ <= 1; - shift_en_ <= 0; - addr_ <= addr_zero_ ? 0 : addr-1; - state_ <= addr_zero_ ? state_one : state_more; + srlo_ = srl[addr]; + shift_en_o_ = 1; + shift_en_ = 0; + addr_ = addr_zero_ ? 0 : addr-1; + state_ = addr_zero_ ? 
state_one : state_more; end end // else: !if(addr_full) end // case: state_more default: begin - srlo_ <= 'bx; - shift_en_o_ <= 1'bx; - shift_en_ <= 1'bx; - addr_ <= 'bx; - state_ <= 2'bx; + srlo_ = 'bx; + shift_en_o_ = 1'bx; + shift_en_ = 1'bx; + addr_ = 'bx; + state_ = 2'bx; end // case: default endcase // case(state) From abb96d6fd9edb6699f59a626d2bd4675d0eb17d3 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Tue, 7 Jan 2025 17:48:06 +0000 Subject: [PATCH 47/51] Move build dir creation into test --- tests/brevitas/test_brevitas_fc.py | 3 +-- tests/transformation/streamline/test_streamline_cnv.py | 3 +-- tests/transformation/streamline/test_streamline_fc.py | 3 +-- 3 files changed, 3 insertions(+), 6 deletions(-) diff --git a/tests/brevitas/test_brevitas_fc.py b/tests/brevitas/test_brevitas_fc.py index 842d099f57..a7a73a5ed4 100644 --- a/tests/brevitas/test_brevitas_fc.py +++ b/tests/brevitas/test_brevitas_fc.py @@ -45,8 +45,6 @@ from finn.util.basic import make_build_dir from finn.util.test import get_test_model_trained -export_onnx_path = make_build_dir("test_brevitas_fc_") - @pytest.mark.brevitas_export # act bits @@ -61,6 +59,7 @@ def test_brevitas_fc_onnx_export_and_exec(size, wbits, abits): if wbits > abits: pytest.skip("No wbits > abits cases at the moment") nname = "%s_%dW%dA" % (size, wbits, abits) + export_onnx_path = make_build_dir("test_brevitas_fc_") finn_onnx = export_onnx_path + "/%s.onnx" % nname fc = get_test_model_trained(size, wbits, abits) ishape = (1, 1, 28, 28) diff --git a/tests/transformation/streamline/test_streamline_cnv.py b/tests/transformation/streamline/test_streamline_cnv.py index 8a91a49278..9e206c843a 100644 --- a/tests/transformation/streamline/test_streamline_cnv.py +++ b/tests/transformation/streamline/test_streamline_cnv.py @@ -50,8 +50,6 @@ from finn.util.basic import make_build_dir from finn.util.test import get_test_model_trained -export_onnx_path = make_build_dir("test_streamline_cnv_") - @pytest.mark.streamline # act bits @@ -64,6 +62,7 @@ def test_streamline_cnv(size, wbits, abits): if wbits > abits: pytest.skip("No wbits > abits cases at the moment") nname = "%s_%dW%dA" % (size, wbits, abits) + export_onnx_path = make_build_dir("test_streamline_cnv_") finn_onnx = export_onnx_path + "/%s.onnx" % nname fc = get_test_model_trained(size, wbits, abits) export_qonnx(fc, torch.randn(1, 3, 32, 32), finn_onnx) diff --git a/tests/transformation/streamline/test_streamline_fc.py b/tests/transformation/streamline/test_streamline_fc.py index edc4a96fe2..9ce2f2ab65 100644 --- a/tests/transformation/streamline/test_streamline_fc.py +++ b/tests/transformation/streamline/test_streamline_fc.py @@ -52,8 +52,6 @@ from finn.util.basic import make_build_dir from finn.util.test import get_test_model_trained -export_onnx_path = make_build_dir("test_streamline_fc_") - @pytest.mark.streamline # act bits @@ -68,6 +66,7 @@ def test_streamline_fc(size, wbits, abits): if wbits > abits: pytest.skip("No wbits > abits cases at the moment") nname = "%s_%dW%dA" % (size, wbits, abits) + export_onnx_path = make_build_dir("test_streamline_fc_") finn_onnx = export_onnx_path + "/%s.onnx" % nname fc = get_test_model_trained(size, wbits, abits) export_qonnx(fc, torch.randn(1, 1, 28, 28), finn_onnx) From 28255c31d649e0d323b98a48a1e266adadecaf5e Mon Sep 17 00:00:00 2001 From: jsmonson Date: Fri, 10 Jan 2025 11:30:43 -0700 Subject: [PATCH 48/51] Add V80 to Alveo part_map --- src/finn/util/basic.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/finn/util/basic.py 

From 28255c31d649e0d323b98a48a1e266adadecaf5e Mon Sep 17 00:00:00 2001
From: jsmonson
Date: Fri, 10 Jan 2025 11:30:43 -0700
Subject: [PATCH 48/51] Add V80 to Alveo part_map

---
 src/finn/util/basic.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/finn/util/basic.py b/src/finn/util/basic.py
index 0cb029a888..3f5f3960e4 100644
--- a/src/finn/util/basic.py
+++ b/src/finn/util/basic.py
@@ -69,6 +69,7 @@
 alveo_part_map["U250"] = "xcu250-figd2104-2L-e"
 alveo_part_map["U280"] = "xcu280-fsvh2892-2L-e"
 alveo_part_map["U55C"] = "xcu55c-fsvh2892-2L-e"
+alveo_part_map["V80"] = "xcv80-lsva4737-2MHP-e-s"

 alveo_default_platform = dict()
 alveo_default_platform["U50"] = "xilinx_u50_gen3x16_xdma_5_202210_1"

From d2e89dff4f601e948798f36fc759b49936ebd5c5 Mon Sep 17 00:00:00 2001
From: jsmonson
Date: Mon, 13 Jan 2025 09:27:11 -0700
Subject: [PATCH 49/51] add V80 similar to other Versal Parts

---
 src/finn/util/basic.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/finn/util/basic.py b/src/finn/util/basic.py
index 3f5f3960e4..870f9f6fa6 100644
--- a/src/finn/util/basic.py
+++ b/src/finn/util/basic.py
@@ -69,7 +69,6 @@
 alveo_part_map["U250"] = "xcu250-figd2104-2L-e"
 alveo_part_map["U280"] = "xcu280-fsvh2892-2L-e"
 alveo_part_map["U55C"] = "xcu55c-fsvh2892-2L-e"
-alveo_part_map["V80"] = "xcv80-lsva4737-2MHP-e-s"

 alveo_default_platform = dict()
 alveo_default_platform["U50"] = "xilinx_u50_gen3x16_xdma_5_202210_1"
@@ -82,7 +81,7 @@
 part_map = {**pynq_part_map, **alveo_part_map}
 part_map["VEK280"] = "xcve2802-vsvh1760-2MP-e-S"
 part_map["VCK190"] = "xcvc1902-vsva2197-2MP-e-S"
-
+part_map["V80"] = "xcv80-lsva4737-2MHP-e-s"

 def get_rtlsim_trace_depth():
     """Return the trace depth for rtlsim via PyVerilator. Controllable
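
Patches 48 and 49 net out to registering the V80 in the combined part_map rather than in alveo_part_map: the board carries a Versal device (xcv80), so listing it alongside the other Versal entries (VEK280, VCK190) keeps it out of Alveo-specific code paths such as the alveo_default_platform lookup. A small usage sketch of the resulting maps, assuming only the entries visible in the diffs above:

from finn.util.basic import alveo_part_map, part_map

# The combined map resolves the new board to its Versal device string...
assert part_map["V80"] == "xcv80-lsva4737-2MHP-e-s"
# ...while Alveo-only logic, keyed on alveo_part_map, never sees it.
assert "V80" not in alveo_part_map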

From ba0261fd2d431568917f1ece7f8569da2daf14ec Mon Sep 17 00:00:00 2001
From: jsmonson
Date: Mon, 13 Jan 2025 09:32:00 -0700
Subject: [PATCH 50/51] add corrected spacing

---
 src/finn/util/basic.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/finn/util/basic.py b/src/finn/util/basic.py
index 870f9f6fa6..5eb72194ea 100644
--- a/src/finn/util/basic.py
+++ b/src/finn/util/basic.py
@@ -83,6 +83,7 @@
 part_map["VCK190"] = "xcvc1902-vsva2197-2MP-e-S"
 part_map["V80"] = "xcv80-lsva4737-2MHP-e-s"

+
 def get_rtlsim_trace_depth():
     """Return the trace depth for rtlsim via PyVerilator. Controllable
     via the RTLSIM_TRACE_DEPTH environment variable. If the env.var. is

From 65a83b2f7943219acbf0f5bc427da46034cdadab Mon Sep 17 00:00:00 2001
From: auphelia
Date: Tue, 14 Jan 2025 16:32:42 +0000
Subject: [PATCH 51/51] [Builder] Relax requirements to derive fpga part for specific board

---
 src/finn/builder/build_dataflow_config.py | 11 +++++------
 tests/fpgadataflow/test_fifosizing.py     |  1 -
 2 files changed, 5 insertions(+), 7 deletions(-)

diff --git a/src/finn/builder/build_dataflow_config.py b/src/finn/builder/build_dataflow_config.py
index 5d69802337..d6437a2e5c 100644
--- a/src/finn/builder/build_dataflow_config.py
+++ b/src/finn/builder/build_dataflow_config.py
@@ -35,7 +35,7 @@
 from typing import Any, List, Optional

 from finn.transformation.fpgadataflow.vitis_build import VitisOptStrategy
-from finn.util.basic import alveo_default_platform, alveo_part_map, pynq_part_map
+from finn.util.basic import alveo_default_platform, part_map


 class AutoFIFOSizingMethod(str, Enum):
@@ -370,11 +370,10 @@ def _resolve_driver_platform(self):
     def _resolve_fpga_part(self):
         if self.fpga_part is None:
             # lookup from part map if not specified
-            if self.shell_flow_type == ShellFlowType.VIVADO_ZYNQ:
-                return pynq_part_map[self.board]
-            elif self.shell_flow_type == ShellFlowType.VITIS_ALVEO:
-                return alveo_part_map[self.board]
-            else:
+            try:
+                fpga_part = part_map[self.board]
+                return fpga_part
+            except KeyError:
                 raise Exception("Couldn't resolve fpga_part for " + self.board)
         else:
             # return as-is when explicitly specified
diff --git a/tests/fpgadataflow/test_fifosizing.py b/tests/fpgadataflow/test_fifosizing.py
index 338204c0c7..e5f9659665 100644
--- a/tests/fpgadataflow/test_fifosizing.py
+++ b/tests/fpgadataflow/test_fifosizing.py
@@ -70,7 +70,6 @@ def test_fifosizing_linear(method, topology):
         synth_clk_period_ns=10.0,
         board="Pynq-Z1",
         rtlsim_batch_size=100 if topology == "tfc" else 2,
-        shell_flow_type=build_cfg.ShellFlowType.VIVADO_ZYNQ,
         generate_outputs=[
             build_cfg.DataflowOutputType.ESTIMATE_REPORTS,
             build_cfg.DataflowOutputType.STITCHED_IP,
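
The builder change in the final patch makes board-to-part resolution independent of shell_flow_type: instead of branching into pynq_part_map or alveo_part_map, _resolve_fpga_part consults the single merged part_map, so any known board (including the newly added V80) resolves even when no shell flow is configured. That is also why test_fifosizing.py can drop its explicit shell_flow_type argument. A standalone sketch of the new rule; the free function and the one-entry map are illustrative stand-ins for DataflowBuildConfig._resolve_fpga_part and finn.util.basic.part_map, and the Pynq-Z1 part string matches FINN's pynq_part_map:

from typing import Dict, Optional


def resolve_fpga_part(board: str, fpga_part: Optional[str], part_map: Dict[str, str]) -> str:
    # An explicitly specified part always wins; otherwise fall back to one
    # unified board->part lookup instead of branching on the shell flow type.
    if fpga_part is not None:
        return fpga_part
    try:
        return part_map[board]
    except KeyError:
        raise Exception("Couldn't resolve fpga_part for " + board)


# Example: resolves via the map when no part is given explicitly.
print(resolve_fpga_part("Pynq-Z1", None, {"Pynq-Z1": "xc7z020clg400-1"}))
# -> xc7z020clg400-1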