From 3208a8912b958b7832635c443e49309ff8f86143 Mon Sep 17 00:00:00 2001
From: nicologhielmetti
Date: Mon, 9 Dec 2024 21:47:17 +0100
Subject: [PATCH 1/4] Sample `FloatQuant` function implemented
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

A sample use of the function can be found in the `Examples` section.
`±inf` are clipped to `±max_val`, `±NaN` are mapped to `±NaN`, and zero
is always representable. I tested with subnormals (to be understood as
subnormals of the output representation) and the quantizer represented
them with no loss, although I did not test this part extensively. I also
tested the function against the Brevitas `FloatQuant` implementation:
they do not always match. For example, I think `0.3125` should be exactly
representable (`x == xq`) by a float quantizer with 4 bits for the
mantissa, 4 bits for the exponent, a bias of 0 and 1 bit for the sign,
yet the Brevitas `FloatQuant` implementation quantizes it to `0.25`. I am
not sure which result should be considered correct in this case.
---
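A quick decomposition supporting the `0.3125` claim: `0.3125 = 1.25 * 2**-2 = 1.0100b * 2**-2`, so the mantissa needs only two of the four available bits and the exponent `-2` is comfortably in range for 4 exponent bits with bias 0. A minimal sketch of the check, assuming the sample `compute_max_val` and `float_quantize` from the docs diff below are in scope:

```python
import numpy as np

x = np.asarray([0.3125], dtype=np.float32)
max_val = compute_max_val(4, 4, 0)
# expect 0.3125 to survive quantization unchanged
xq = float_quantize(x, 1, 4, 4, 0, max_val, "ROUND")
assert np.all(xq == x)
```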
 docs/qonnx-custom-ops/floatquant_op.md | 83 +++++++++++++++++++++++++-
 1 file changed, 81 insertions(+), 2 deletions(-)

diff --git a/docs/qonnx-custom-ops/floatquant_op.md b/docs/qonnx-custom-ops/floatquant_op.md
index ec8b85fd..ebd35c90 100644
--- a/docs/qonnx-custom-ops/floatquant_op.md
+++ b/docs/qonnx-custom-ops/floatquant_op.md
@@ -64,8 +64,87 @@ This operator is not part of the ONNX standard and is not currently versioned.
 
 #### Examples
 
-TODO
+```python
+import numpy as np
+
+def compute_max_val(exponent_bit_width, mantissa_bit_width, exponent_bias):
+    max_exponent = (2. ** exponent_bit_width) - 1. - exponent_bias
+    max_mantissa = np.sum((
+        2. ** np.arange(
+            0,
+            -1. * mantissa_bit_width - 1.,
+            -1.
+        )))
+    max_val = max_mantissa * (2 ** max_exponent)
+    return max_val
+
+x = np.random.rand(100).astype(np.float32)
+scale = 1
+exponent_bitwidth = 4
+mantissa_bitwidth = 3
+exponent_bias = 0
+max_val = compute_max_val(exponent_bitwidth, mantissa_bitwidth, exponent_bias)
+rounding_mode = 'ROUND'
+signed = True
+xq = float_quantize(x, scale, exponent_bitwidth, mantissa_bitwidth, exponent_bias, max_val, rounding_mode)
+```
 
 #### Sample Implementation
 
-TODO
+```python
+def float_quantize(X, scale, exponent_bitwidth, mantissa_bitwidth, exponent_bias, max_val, rounding_mode):
+    """Quantize a given floating point array to minifloat format by specifying the desired minifloat quantization"""
+
+    def resolve_rounding_mode(mode_string):
+        """Resolve the rounding mode string to the corresponding numpy functions."""
+        mode_string = mode_string.upper()
+        if mode_string == "ROUND":
+            return np.round
+        elif mode_string == "CEIL":
+            return np.ceil
+        elif mode_string == "FLOOR":
+            return np.floor
+        else:
+            raise ValueError(f"Could not resolve rounding mode called: {mode_string}")
+
+    # copy the sign of the input
+    sign = np.sign(X)
+    # compute the mask of the values equal to 0 - the output will always be zero there
+    zero_mask = np.where(X == 0)
+    # copy the input so as not to modify it
+    X = X.copy()
+    # set the zeros to 1.0 - any placeholder value would do
+    X[zero_mask] = 1.0
+    # apply the scale to the input
+    X /= scale
+    # get input exponents from the floats - no need for an eps since the zeros have already been removed
+    e_inp = np.floor(np.log2(np.abs(X)))
+    # compute the max exponent given the exponent bitwidth.
+    # Note: inf/NaN representation is included and it is clipped at the end of this function
+    e_max = np.maximum(2.**(exponent_bitwidth), 1.)
+    # compute the exponent range given the max exponent. e_low represents the subnormals of the quantized representation, e_high the infs/NaNs
+    e_low, e_high = -e_max + exponent_bias + 1, e_max + exponent_bias
+    # limit the value of the exponent given the quantization range
+    e_quant = np.clip(e_inp, e_low, e_high)
+    # compute the shift to get the quantized value rounded properly. This effectively quantizes the mantissa
+    # (rounds it by zeroing the bits that do not belong to the quantized representation)
+    round_shift = 2.**(e_quant - mantissa_bitwidth)
+    # apply the shift
+    man = X / round_shift
+    # round the mantissa
+    man_quant = resolve_rounding_mode(rounding_mode)(man)
+    # compute the max value of the mantissa (i.e. all the mantissa bits set to 1)
+    man_max = 2.**(mantissa_bitwidth + 1) - 1
+    # if the quantized value is a subnormal, remove 1 from the mantissa (i.e. 1 + 2**m => 2**m)
+    man_max = np.where(e_quant != e_low, man_max, man_max - 1)
+    # make sure the mantissa is in the representable range
+    man_clip = np.clip(man_quant, -man_max, man_max)
+    # go back to float representation
+    qx = man_clip * round_shift
+    # if it is inf or NaN, saturate to sign * max_val
+    qx = np.where(e_quant == e_high, sign * max_val, qx)
+    # restore the original zeros
+    qx[zero_mask] = 0.0
+    # re-apply the scale
+    qx *= scale
+    return qx
+```
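A worked note on the example's headroom (my arithmetic, not taken from the patch): with 4 exponent bits, 3 mantissa bits and bias 0, `compute_max_val` evaluates to `(2 - 2**-3) * 2**15 = 61440.0`, so the `np.random.rand` inputs in `[0, 1)` are nowhere near the saturation path of the quantizer.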
From d7d35b284f6ac50c3a4d541079a1de39e67614b6 Mon Sep 17 00:00:00 2001
From: Yaman Umuroglu
Date: Tue, 10 Dec 2024 12:47:17 +0100
Subject: [PATCH 2/4] [FloatQ] copy over float_quantize into custom op
 placeholder

Co-authored-by: Nicolo Ghielmetti
---
 src/qonnx/custom_op/general/floatquant.py | 95 +++++++++++++++++++++++
 1 file changed, 95 insertions(+)
 create mode 100644 src/qonnx/custom_op/general/floatquant.py

diff --git a/src/qonnx/custom_op/general/floatquant.py b/src/qonnx/custom_op/general/floatquant.py
new file mode 100644
index 00000000..9cddb945
--- /dev/null
+++ b/src/qonnx/custom_op/general/floatquant.py
@@ -0,0 +1,95 @@
+# Copyright (c) 2024 Nicolo Ghielmetti
+# Copyright (c) 2024 Advanced Micro Devices, Inc.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of qonnx nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import numpy as np
+
+from qonnx.custom_op.general.quant import resolve_rounding_mode
+
+
+def compute_default_exponent_bias(exponent_bitwidth):
+    return (2.0 ** (exponent_bitwidth - 1)) - 1
+
+
+def compute_max_val(exponent_bitwidth, mantissa_bitwidth, exponent_bias=None):
+    if exponent_bias is None:
+        exponent_bias = compute_default_exponent_bias(exponent_bitwidth)
+    max_exponent = (2.0**exponent_bitwidth) - 1.0 - exponent_bias
+    max_mantissa = np.sum((2.0 ** np.arange(0, -1.0 * mantissa_bitwidth - 1.0, -1.0)))
+    max_val = max_mantissa * (2**max_exponent)
+    return max_val
+
+
+def float_quantize(X, scale, exponent_bitwidth, mantissa_bitwidth, exponent_bias=None, max_val=None, rounding_mode="ROUND"):
+    """Quantize a given floating point array to minifloat format by specifying the desired minifloat quantization"""
+    if exponent_bias is None:
+        exponent_bias = compute_default_exponent_bias(exponent_bitwidth)
+    if max_val is None:
+        max_val = compute_max_val(exponent_bitwidth, mantissa_bitwidth, exponent_bias)
+    # copy the sign of the input
+    sign = np.sign(X)
+    # compute the mask of the values equal to 0 - the output will always be zero there
+    zero_mask = np.where(X == 0)
+    # copy the input so as not to modify it
+    X = X.copy()
+    # set the zeros to 1.0 - any placeholder value would do
+    X[zero_mask] = 1.0
+    # apply the scale to the input
+    X /= scale
+    # get input exponents from the floats - no need for an eps since the zeros have already been removed
+    e_inp = np.floor(np.log2(np.abs(X)))
+    # compute the max exponent given the exponent bitwidth.
+    # Note: inf/NaN representation is included and it is clipped at the end of this function
+    e_max = np.maximum(2.0 ** (exponent_bitwidth), 1.0)
+    # compute the exponent range given the max exponent. e_low represents the subnormals of the
+    # quantized representation, e_high the infs/NaNs
+    e_low, e_high = -e_max + exponent_bias + 1, e_max + exponent_bias
+    # limit the value of the exponent given the quantization range
+    e_quant = np.clip(e_inp, e_low, e_high)
+    # compute the shift to get the quantized value rounded properly. This effectively quantizes the mantissa
+    # (rounds it by zeroing the bits that do not belong to the quantized representation)
+    round_shift = 2.0 ** (e_quant - mantissa_bitwidth)
+    # apply the shift
+    man = X / round_shift
+    # round the mantissa
+    man_quant = resolve_rounding_mode(rounding_mode)(man)
+    # compute the max value of the mantissa (i.e. all the mantissa bits set to 1)
+    man_max = 2.0 ** (mantissa_bitwidth + 1) - 1
+    # if the quantized value is a subnormal, remove 1 from the mantissa (i.e. 1 + 2**m => 2**m)
+    man_max = np.where(e_quant != e_low, man_max, man_max - 1)
+    # make sure the mantissa is in the representable range
+    man_clip = np.clip(man_quant, -man_max, man_max)
+    # go back to float representation
+    qx = man_clip * round_shift
+    # if it is inf or NaN, saturate to sign * max_val
+    qx = np.where(e_quant == e_high, sign * max_val, qx)
+    # restore the original zeros
+    qx[zero_mask] = 0.0
+    # re-apply the scale
+    qx *= scale
+    return qx
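As a quick, hypothetical sanity check of the defaults introduced here (my examples, not part of the patch): the default bias follows the usual IEEE-style `2**(e-1) - 1` convention, and `compute_max_val` reproduces the FP4 E2M1 max normal value used in the tests that follow:

```python
from qonnx.custom_op.general.floatquant import compute_default_exponent_bias, compute_max_val

assert compute_default_exponent_bias(8) == 127.0  # matches the IEEE 754 single-precision bias
assert compute_max_val(2, 1) == 6.0  # FP4 E2M1 max normal value, i.e. 1.5 * 2**2
```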
From 0f6633a0ac5e8d37e51f84f5fba98a194fb9508b Mon Sep 17 00:00:00 2001
From: Yaman Umuroglu
Date: Tue, 10 Dec 2024 12:48:52 +0100
Subject: [PATCH 3/4] [Test] add test skeleton for compute_max_val and
 float_quantize

---
 tests/custom_op/test_floatquant.py | 49 ++++++++++++++++++++++++++++++
 1 file changed, 49 insertions(+)
 create mode 100644 tests/custom_op/test_floatquant.py

diff --git a/tests/custom_op/test_floatquant.py b/tests/custom_op/test_floatquant.py
new file mode 100644
index 00000000..91ebdc92
--- /dev/null
+++ b/tests/custom_op/test_floatquant.py
@@ -0,0 +1,49 @@
+# Copyright (c) 2024 Advanced Micro Devices, Inc.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of qonnx nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+import numpy as np
+
+from qonnx.custom_op.general.floatquant import compute_max_val, float_quantize
+
+
+def test_compute_max_val():
+    # reference max normal values from the OCP MX 1.0 standard
+    assert compute_max_val(2, 3) == 7.5  # FP6 E2M3
+    assert compute_max_val(3, 2) == 28.0  # FP6 E3M2
+    assert compute_max_val(2, 1) == 6.0  # FP4 E2M1
+
+
+def test_float_quantize():
+    zero_tensor = np.zeros((2, 2))
+    unit_scale = np.asarray([1.0], dtype=np.float32)
+    assert np.all(float_quantize(zero_tensor, unit_scale, 2, 3) == zero_tensor)
+    testcase_a = np.asarray([1.5], dtype=np.float32)
+    testcase_b = np.asarray([3.25], dtype=np.float32)
+    assert np.all(float_quantize(testcase_a, unit_scale, 2, 3) == testcase_a)
+    assert np.all(float_quantize(testcase_b, unit_scale, 2, 3) == testcase_b)
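A brief worked check of why these two values should round-trip exactly under E2M3 (my arithmetic, not part of the patch): `1.5 = 1.100b * 2**0` and `3.25 = 1.101b * 2**1`, so both mantissas fit in the 3 available bits and both exponents lie within the E2M3 normal range, which is why the assertions expect the inputs back unchanged.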
From 491a3bebb958da47c802f22cb190b73c9be01c7a Mon Sep 17 00:00:00 2001
From: nicologhielmetti
Date: Thu, 12 Dec 2024 00:36:39 +0100
Subject: [PATCH 4/4] FloatQuant implementation improved to pass the
 nullifying tests Yaman provided

Some other tests have been added.
---
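A small usage sketch of the new `lt_subnorm_to_zero` flag, mirroring the `0.124` test case added below (my example, assuming the updated module is importable):

```python
import numpy as np

from qonnx.custom_op.general.floatquant import float_quantize

x = np.asarray([0.124], dtype=np.float32)
# for E2M3 with the default bias, values below the smallest representable
# subnormal are flushed to zero when lt_subnorm_to_zero is set
print(float_quantize(x, 1.0, 2, 3, lt_subnorm_to_zero=True))  # -> [0.]
```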
 src/qonnx/custom_op/general/floatquant.py | 23 +++++++++++++++++++----
 tests/custom_op/test_floatquant.py        |  8 ++++++++
 2 files changed, 27 insertions(+), 4 deletions(-)

diff --git a/src/qonnx/custom_op/general/floatquant.py b/src/qonnx/custom_op/general/floatquant.py
index 9cddb945..7447392e 100644
--- a/src/qonnx/custom_op/general/floatquant.py
+++ b/src/qonnx/custom_op/general/floatquant.py
@@ -45,7 +45,16 @@ def compute_max_val(exponent_bitwidth, mantissa_bitwidth, exponent_bias=None):
     return max_val
 
 
-def float_quantize(X, scale, exponent_bitwidth, mantissa_bitwidth, exponent_bias=None, max_val=None, rounding_mode="ROUND"):
+def float_quantize(
+    X,
+    scale,
+    exponent_bitwidth,
+    mantissa_bitwidth,
+    exponent_bias=None,
+    max_val=None,
+    rounding_mode="ROUND",
+    lt_subnorm_to_zero=False,
+):
     """Quantize a given floating point array to minifloat format by specifying the desired minifloat quantization"""
     if exponent_bias is None:
         exponent_bias = compute_default_exponent_bias(exponent_bitwidth)
@@ -65,10 +74,10 @@ def float_quantize(X, scale, exponent_bitwidth, mantissa_bitwidth, exponent_bias
     e_inp = np.floor(np.log2(np.abs(X)))
     # compute the max exponent given the exponent bitwidth.
     # Note: inf/NaN representation is included and it is clipped at the end of this function
-    e_max = np.maximum(2.0 ** (exponent_bitwidth), 1.0)
+    e_max = np.maximum(2.0 ** (exponent_bitwidth) - 1, 1.0)
     # compute the exponent range given the max exponent. e_low represents the subnormals of the
     # quantized representation, e_high the infs/NaNs
-    e_low, e_high = -e_max + exponent_bias + 1, e_max + exponent_bias
+    e_low, e_high = -e_max + exponent_bias + 1, e_max - exponent_bias
     # limit the value of the exponent given the quantization range
     e_quant = np.clip(e_inp, e_low, e_high)
     # compute the shift to get the quantized value rounded properly. This effectively quantizes the mantissa
@@ -80,6 +89,8 @@ def float_quantize(X, scale, exponent_bitwidth, mantissa_bitwidth, exponent_bias
     man_quant = resolve_rounding_mode(rounding_mode)(man)
     # compute the max value of the mantissa (i.e. all the mantissa bits set to 1)
     man_max = 2.0 ** (mantissa_bitwidth + 1) - 1
+    # compute the min value of the mantissa (i.e. only the least significant mantissa bit set)
+    man_min = 2.0**-mantissa_bitwidth
     # if the quantized value is a subnormal, remove 1 from the mantissa (i.e. 1 + 2**m => 2**m)
     man_max = np.where(e_quant != e_low, man_max, man_max - 1)
     # make sure the mantissa is in the representable range
@@ -88,7 +99,11 @@ def float_quantize(X, scale, exponent_bitwidth, mantissa_bitwidth, exponent_bias
     qx = man_clip * round_shift
     # if it is inf or NaN, saturate to sign * max_val
     qx = np.where(e_quant == e_high, sign * max_val, qx)
-    # restore the original zeros
+    if lt_subnorm_to_zero:
+        # compute the min subnormal as the lowest possible exponent times the min mantissa
+        min_subnormal = 2.0 ** (e_low + 1) * man_min
+        # if the value is closer to zero than the minimum subnormal then set it to 0
+        qx = np.where((X <= min_subnormal) & (X >= -min_subnormal), 0.0, qx)
+    # restore the original zeros
     qx[zero_mask] = 0.0
     # re-apply the scale
     qx *= scale
diff --git a/tests/custom_op/test_floatquant.py b/tests/custom_op/test_floatquant.py
index 91ebdc92..d4ff17d0 100644
--- a/tests/custom_op/test_floatquant.py
+++ b/tests/custom_op/test_floatquant.py
@@ -45,5 +45,13 @@ def test_float_quantize():
     assert np.all(float_quantize(zero_tensor, unit_scale, 2, 3) == zero_tensor)
     testcase_a = np.asarray([1.5], dtype=np.float32)
     testcase_b = np.asarray([3.25], dtype=np.float32)
+    testcase_c = np.asarray([8.0], dtype=np.float32)
+    testcase_d = np.asarray([28.2], dtype=np.float32)
+    testcase_e = np.asarray([6.1], dtype=np.float32)
+    testcase_f = np.asarray([0.124], dtype=np.float32)
     assert np.all(float_quantize(testcase_a, unit_scale, 2, 3) == testcase_a)
     assert np.all(float_quantize(testcase_b, unit_scale, 2, 3) == testcase_b)
+    assert np.all(float_quantize(testcase_c, unit_scale, 2, 3) == compute_max_val(2, 3))
+    assert np.all(float_quantize(testcase_d, unit_scale, 3, 2) == compute_max_val(3, 2))
+    assert np.all(float_quantize(testcase_e, unit_scale, 2, 1) == compute_max_val(2, 1))
+    assert np.all(float_quantize(testcase_f, unit_scale, 2, 3, lt_subnorm_to_zero=True) == 0.0)
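For the record, a worked check of the last assertion (my arithmetic, not from the patch): for E2M3 with the default bias of 1, `e_max = 2**2 - 1 = 3` and `e_low = -e_max + 1 + 1 = -1`, so the flush threshold is `min_subnormal = 2**(e_low + 1) * 2**-3 = 0.125`; `0.124` falls below it and is therefore nullified to `0.0` when `lt_subnorm_to_zero=True`.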