
Commit

No public description
PiperOrigin-RevId: 578963121
Change-Id: I0fd7881b4fb32caca3e4e83dd5480b32a99813b9
lishanok authored and copybara-github committed Nov 2, 2023
1 parent ca55422 commit f55d93f
Showing 4 changed files with 270 additions and 17 deletions.
52 changes: 36 additions & 16 deletions qkeras/qtools/divide_and_conquer.py
@@ -36,10 +36,11 @@
from qkeras.qtools import qgraph
from qkeras.qtools import qtools_util
from qkeras.qtools import generate_layer_data_type_map
from qkeras.qtools import dnc_layer_cost_ace


class CostMode(enum.Enum):
NAIVE = 1 # cost is computed from theoretical equations.
ACE = 1 # cost is computed from theoretical equations.
PE_AREA = 2 # cost is computed from compute area only.
PE_BW_AREA = 3 # cost is computed from both compute and memory bandwidth.

@@ -118,6 +119,7 @@ def get_layer_quantizer_bitwidth(
layer_item = self._layer_map[layer]
weight_quantizer = qtools_util.get_val(layer_item, "weight_quantizer")
mac_quantizer = qtools_util.get_val(layer_item, "multiplier")
acc_quantizer = qtools_util.get_val(layer_item, "accumulator")
input_quantizer_list = qtools_util.get_val(
layer_item, "input_quantizer_list")
output_quantizer = qtools_util.get_val(layer_item, "output_quantizer")
@@ -134,6 +136,9 @@
"mac_bits": (
mac_quantizer.output.bits if mac_quantizer else
input_quantizer_list[0].bits),
"acc_bits": (
acc_quantizer.output.bits if acc_quantizer else
input_quantizer_list[0].bits),
"output_bits": output_quantizer.bits}
else:
# For the "dummy" head and tail nodes in the graph that we inserted at
@@ -142,6 +147,7 @@
"input_bits": 0,
"weight_bits": 0,
"mac_bits": 0,
"acc_bits": 0,
"output_bits": 0
}

@@ -247,19 +253,27 @@ def get_per_layer_cost(layer_quantizer_bitwidth, layer_mac_count, layer_shapes,
InElementPerClk, OutElementPerClk, mode):
"""Area per layer, including both PE and memory Bandwidth."""

# TODO(lishanok@): needs a better cost modeling function. For now we simplify
# it to the number of multipliers + interface bitwidth.
assert mode == CostMode.NAIVE, "Only CostMode.NAIVE is supported for now."
# TODO(lishanok@): needs to add modes that support data-driven cost modeling.
assert mode == CostMode.ACE, "Only CostMode.ACE is supported for now."

pe_area = (layer_quantizer_bitwidth["input_bits"] *
layer_quantizer_bitwidth["weight_bits"] * layer_mac_count *
cin_unroll * cout_unroll * kh_unroll * kw_unroll)
# Compute area is calculated according to the ACE metric, translated to gates.
mac_gates = dnc_layer_cost_ace.get_ace_mac_gates(
xbit=layer_quantizer_bitwidth["input_bits"],
wbit=layer_quantizer_bitwidth["weight_bits"],
abit=layer_quantizer_bitwidth["acc_bits"],
regen_params=False)
pe_area = (mac_gates * layer_mac_count * cin_unroll * cout_unroll *
kh_unroll * kw_unroll)

# Memory includes input, output and weight memory, translated to gates.
memory_area = (
InElementPerClk * layer_quantizer_bitwidth["input_bits"]
+ OutElementPerClk * layer_quantizer_bitwidth["output_bits"] +
InElementPerClk * layer_quantizer_bitwidth["input_bits"] *
dnc_layer_cost_ace.MemoryGatesPerBit["Register"] +
OutElementPerClk * layer_quantizer_bitwidth["output_bits"] *
dnc_layer_cost_ace.MemoryGatesPerBit["Register"] +
np.product(layer_shapes["weight_shape"]) *
layer_quantizer_bitwidth["weight_bits"])
layer_quantizer_bitwidth["weight_bits"] *
dnc_layer_cost_ace.MemoryGatesPerBit["ROM"])

return (pe_area + memory_area)
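
A minimal sketch of how the new per-layer cost composes, using hypothetical bit-widths, unroll factors and interface widths (the real values come from the layer map and the hardware-parameter search):

# Illustration only: hypothetical layer with a 3x3 kernel, 64 input and 64 output channels.
xbit, wbit, abit, obit = 8, 4, 32, 8
layer_mac_count = 1
cin_unroll, cout_unroll, kh_unroll, kw_unroll = 4, 4, 1, 1
in_element_per_clk, out_element_per_clk = 4, 4
weight_count = 3 * 3 * 64 * 64

mac_gates = dnc_layer_cost_ace.get_ace_mac_gates(xbit=xbit, wbit=wbit, abit=abit)
pe_area = mac_gates * layer_mac_count * cin_unroll * cout_unroll * kh_unroll * kw_unroll
# Input/output interfaces are register-based, weights sit in ROM.
memory_area = (
    in_element_per_clk * xbit * dnc_layer_cost_ace.MemoryGatesPerBit["Register"] +
    out_element_per_clk * obit * dnc_layer_cost_ace.MemoryGatesPerBit["Register"] +
    weight_count * wbit * dnc_layer_cost_ace.MemoryGatesPerBit["ROM"])
layer_cost = pe_area + memory_area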

@@ -323,7 +337,7 @@ def set_best_global_cost_in_paths(
layer_idx: Int. The index value of the current layer's predecessor.
cur_layer_idx: current layer's index value.
layer_quantizer_bitwidth: Dict that contains layer-related quantizer
bitwidth, including mac_bits, input_bits and output_bits.
bitwidth, including acc_bits, mac_bits, input_bits and output_bits.
layer_mac_count: Int. Use the number of multiplications as the operation
count. To include the number of accumulations, we should multiply the
value by 2, assuming accumulation count ~= multiplication count.
@@ -487,18 +501,19 @@ def multiply_elements_except_none(my_tuple):
input_size = multiply_elements_except_none(layer.input_shape[:-1])
output_size = multiply_elements_except_none(layer.output_shape[:-1])
target_in_throughput = target_out_throughput * input_size / output_size
target_pe_throughput = max(target_out_throughput, target_in_throughput)
else:
target_in_throughput = target_pe_throughput = target_out_throughput
target_in_throughput = target_out_throughput

# Per the new design, target_pe_throughput equals target_out_throughput.
target_pe_throughput = target_out_throughput
return target_in_throughput, target_pe_throughput
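
A quick worked example of the throughput bookkeeping above, with hypothetical shapes: a layer that turns 32x32 spatial positions into 16x16 must consume four input elements for every output element, so the input throughput target is scaled up while the PE throughput target simply tracks the output target.

# Hypothetical downsampling layer: 32*32 input positions -> 16*16 output positions.
target_out_throughput = 1.0
input_size, output_size = 32 * 32, 16 * 16
target_in_throughput = target_out_throughput * input_size / output_size  # 4.0
target_pe_throughput = target_out_throughput  # 1.0, per the new design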


def calc_hw_params(graph, target_OutElementPerClk, target_out_throughput,
input_quantizer_bits,
compute_to_memory_max_ratio=4,
memory_to_unroll_max_ratio=4,
mode=CostMode.NAIVE):
mode=CostMode.ACE):
"""Calculate HW params that minimizes total cost.
Args:
@@ -512,7 +527,7 @@ def calc_hw_params(graph, target_OutElementPerClk, target_out_throughput,
ComputeOutElement and OutElement
memory_to_unroll_max_ratio: Int. Max allowed ratio between
InElementPerClk and CinUnroll
mode: CostMode. The mode to calculate per layer cost. Default is NAIVE.
mode: CostMode. The mode to calculate per layer cost. Default is ACE.
Returns:
best_path: Dict. Stores the best hw param value at each layer and their
@@ -580,6 +595,7 @@ def calc_hw_params(graph, target_OutElementPerClk, target_out_throughput,
kernel_height = qtools_util.get_layer_info(cur_layer, "kernel_height")
kernel_width = qtools_util.get_layer_info(cur_layer, "kernel_width")
layer_type = qtools_util.get_layer_info(cur_layer, "layer_type")
output_channel_divisors = qtools_util.find_divisors(output_channel)

logging.debug("input_channel: %d, output_channel: %d, kernel_height: %d, "
"kernel_width: %d, weight_quantizer_bits: %d",
@@ -620,6 +636,10 @@ def calc_hw_params(graph, target_OutElementPerClk, target_out_throughput,
l = OutElementPerClk / ComputeOutElementPerClk
cout_unroll = ComputeOutElementPerClk

# cout_unroll needs to be a divisor of output_channel
if cout_unroll not in output_channel_divisors:
continue

logging.debug(
".........OutElementPerClk / ComputeOutElementPerClk = %.2f,"
"cout_unroll=%.2f", l, cout_unroll)
@@ -708,7 +728,7 @@ def estimate_model_cost(
target_out_throughput: float = 1.0,
compute_to_memory_max_ratio: int = 4,
memory_to_unroll_max_ratio: int = 4,
mode: CostMode = CostMode.NAIVE):
mode: CostMode = CostMode.ACE):
"""Main function to divide and conquer cost modeling.
Args:
231 changes: 231 additions & 0 deletions qkeras/qtools/dnc_layer_cost_ace.py
@@ -0,0 +1,231 @@
# Copyright 2019 Google LLC
#
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

"""divide_and_conquer per layer cost modeling using ACE and data fitting.
For a given layer with its hardware design params, predict its cost
in actual ASIC implementation using ACE metric and actual MAC gates data points.
"""

import io
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.optimize import curve_fit


# Rule-of-thumb mapping between bits and gates in memory area estimate.
MemoryGatesPerBit = {
'Register': 10.0,
'SRAM': 1.0,
'ROM': 0.1,
}


# Previously calculated 3D polynomial coefficients with relative MAE<5%.
MAC_POLY3D_PARAMS = np.array([7.70469119, 13.76199652, -92.15756665])


# MAC area data points generated from go/mac_vs_area.
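# Table layout (inferred from the index construction in gen_mac_gate_model below):
# rows are weight bits {1, 2, 4, 8, 12, 16}, columns are input bits
# {1, 2, 4, 8, 10, 12, 16, 24, 32}; empty cells are unmeasured configurations
# and are filtered out as NaN during fitting.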
MAC24 = pd.read_csv(io.StringIO('''
283,280,286,313,325,336,356,,
274,290,325,372,401,428,485,,
285,325,388,510,568,614,713,,
308,372,509,750,865,1002,1167,,
336,427,617,1003,1151,1309,,,
356,480,722,1165,,,,,
'''), header=None)

MAC32 = pd.read_csv(io.StringIO('''
391,365,377,410,453,433,458,507,
364,382,418,466,497,521,578,685,
378,418,485,594,659,721,832,1035,
408,466,596,843,1029,1151,1321,1642,
432,521,724,1153,1363,1512,1797,,
457,578,830,1330,1551,1782,2273,,
'''), header=None)

MAC40 = pd.read_csv(io.StringIO('''
458,457,470,500,522,527,551,605,664
457,475,513,561,597,616,670,782,888
470,513,579,699,766,816,928,1150,1358
499,561,699,996,1161,1273,1499,1850,2189
527,612,818,1275,1545,1691,2054,2516,
549,670,927,1496,1798,2035,2490,3294,
'''), header=None)

MAC48 = pd.read_csv(io.StringIO('''
595,550,566,594,659,624,642,694,745
551,566,607,654,727,707,763,881,984
566,607,679,794,871,921,1017,1270,1489
594,655,793,1097,1285,1401,1668,2101,2378
624,711,921,1397,1816,1950,2277,2763,3301
642,762,1015,1669,1974,2264,2718,3631,4415
'''), header=None)


def mac_gates_polynomial_3d(xyz, a, b, c):
"""Using a 3d polynomial function to model MAC area.
This function models the MAC area to be the sum of multipler, accumulator
and a constant shift. Particularly, multiplier area is modeled to be linear
# to input_bits * weight_bits, per ACE rule.
Args:
xyz: tuple includes input, weight and accumulator bits.
a: polynomial coefficient 0.
b: polynomial coefficient 1.
c: polynomial coefficient 2.
Returns:
MAC area predicted by the function.
"""
x, y, z = xyz
return a * x * y + b * z + c
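
As a sanity check, plugging the pre-fitted MAC_POLY3D_PARAMS coefficients into this model for an 8-bit-by-8-bit MAC with a 32-bit accumulator gives roughly 7.705 * 64 + 13.762 * 32 - 92.158, about 841 gates, close to the 843-gate measurement at that point in the MAC32 table.

# Sanity check of the fitted model against one raw data point (MAC32, xbit=8, wbit=8).
a, b, c = MAC_POLY3D_PARAMS
estimate = mac_gates_polynomial_3d((8, 8, 32), a, b, c)  # ~841 gates vs. 843 measured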


def gen_mac_gate_model(do_plot=False):
"""Generate the polynomial cost model coefficients using given data.
Args:
do_plot: Bool indicating whether to plot the raw data and the fitted curve.
Returns:
params: The estimated params of the polynomial function.
mae_predict: The mean absolute error of the predictions, relative to the
mean of the given data.
parameter_std_deviation: One-standard-deviation errors on the parameters,
indicating the uncertainties of the params.
"""
# acc bits, 1st index
abit = np.array([24, 32, 40, 48])
abit = np.repeat(abit, 54)

# weight bits, 2nd index
wbit = np.array([1, 2, 4, 8, 12, 16])
wbit = np.tile(np.repeat(wbit, 9), 4)

# input bits, 3rd index
xbit = np.array([1, 2, 4, 8, 10, 12, 16, 24, 32])
xbit = np.tile(xbit, 24)

# Record all mac area data points associated with each accumulator bitwidth
mac_arrs = []
# Record the start and end index of the mac area data points
# associated with each accumulator bitwidth
mac_arrs_index = {}
# Record index of all valid data points
valid_index = []
start_pos = 0

for (mac_acc, acc_bits) in zip(
[MAC24, MAC32, MAC40, MAC48], [24, 32, 40, 48]):
cur_mac = mac_acc.to_numpy().reshape(-1)
# Filter out nan data points
cur_valid_index = ~np.isnan(cur_mac)
cur_valid_mac = cur_mac[cur_valid_index]
# Record the data length for each accumulator bits
end_pos = start_pos + len(cur_valid_mac)
mac_arrs_index[acc_bits] = (start_pos, end_pos)
# Append mac areas of each accumulator bits to a list
mac_arrs += list(cur_valid_mac)
start_pos = end_pos
valid_index += list(cur_valid_index)

# Filter out invalid data
xbit = xbit[valid_index]
wbit = wbit[valid_index]
abit = abit[valid_index]

# curve fitting for all data points
params, covariance = curve_fit(
mac_gates_polynomial_3d, (xbit, wbit, abit), mac_arrs)

# Compute one standard deviation errors on the parameters.
parameter_std_deviation = np.sqrt(np.diag(covariance))

# Calculate the mean absolute error between prediction and given data.
mac_predict = mac_gates_polynomial_3d((xbit, wbit, abit), *params)
mae = np.mean(np.abs(mac_predict - mac_arrs))
mae_predict = mae / np.mean(mac_arrs)

if do_plot:
# Plot all raw data points
fig = plt.figure(figsize=(10, 10))
ax = fig.add_subplot(111, projection='3d')

ax.scatter(xbit, wbit, mac_arrs, label='Data')

ax.set_xlabel('X_bits')
ax.set_ylabel('W_bits')
ax.set_zlabel('MAC')

plt.title('MAC area data points')
plt.show()

# Generate a mesh grid for plotting.
x_fit = np.linspace(min(xbit), max(xbit), 50)
w_fit = np.linspace(min(wbit), max(wbit), 50)
xmesh, wmesh = np.meshgrid(x_fit, w_fit)

fig = plt.figure(figsize=(16, 16))
index = 1

# Plotting 3D fitting curve for each accumulator bitwidth
for acc_bits in [24, 32, 40, 48]:
ax = fig.add_subplot(2, 2, index, projection='3d')

start_pos = mac_arrs_index[acc_bits][0]
end_pos = mac_arrs_index[acc_bits][1]
ax.scatter(xbit[start_pos:end_pos], wbit[start_pos:end_pos],
mac_arrs[start_pos:end_pos], label='Data')

amesh = np.full(shape=(50, 50), fill_value=acc_bits)
poly_fit = mac_gates_polynomial_3d((xmesh, wmesh, amesh), *params)

ax.plot_surface(
xmesh, wmesh, poly_fit, cmap='viridis', alpha=0.8,
label=f'Fitted Surface | acc_bits={acc_bits}')

ax.set_xlabel('X')
ax.set_ylabel('W')
ax.set_zlabel('MAC')
ax.set_title(f'accumulator bitwidth: {acc_bits}')
index += 1

plt.show()

return params, mae_predict, parameter_std_deviation
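
Illustrative usage for refitting the coefficients from the raw tables; since it fits the same data, the result should closely reproduce MAC_POLY3D_PARAMS:

params, rel_mae, param_std = gen_mac_gate_model(do_plot=False)
print(params, rel_mae, param_std)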


def get_ace_mac_gates(xbit, wbit, abit, regen_params=False):
"""Function to estimate MAC area, including 1 multipler and 1 accumulator.
Args:
xbit: int. input bits.
wbit: int. weight bits.
abit: int. accumulator bits.
regen_params: Bool. If True, regenerate the MAC cost model coefficients.
If False, reuse the previously generated model coefficients.
Returns:
Estimated MAC gates.
"""
if regen_params:
mac_params, _, _ = gen_mac_gate_model(do_plot=True)
else:
mac_params = MAC_POLY3D_PARAMS

return mac_gates_polynomial_3d((xbit, wbit, abit), *mac_params)
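
And the typical call made from get_per_layer_cost, reusing the stored coefficients (bit-widths here are illustrative only):

gates = get_ace_mac_gates(xbit=8, wbit=4, abit=24, regen_params=False)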
2 changes: 2 additions & 0 deletions requirements.txt
@@ -1,6 +1,8 @@
tensorflow>=2.5.0rc0
numpy>=1.16.5
pyparser
pandas>=1.1.0
matplotlib>=3.3.0
scipy>=1.4.1
setuptools>=41.0.0
argparse>=1.4.0
2 changes: 1 addition & 1 deletion tests/qtools_model_test.py
@@ -985,7 +985,7 @@ def test_divide_and_conquer_sequential_conv2d():
target_out_throughput=1.0,
compute_to_memory_max_ratio=1,
memory_to_unroll_max_ratio=1,
mode=divide_and_conquer.CostMode.NAIVE,
mode=divide_and_conquer.CostMode.ACE,
)


