diff --git a/qkeras/qtools/divide_and_conquer.py b/qkeras/qtools/divide_and_conquer.py
index d27c0baf..41a37827 100644
--- a/qkeras/qtools/divide_and_conquer.py
+++ b/qkeras/qtools/divide_and_conquer.py
@@ -36,10 +36,11 @@
 from qkeras.qtools import qgraph
 from qkeras.qtools import qtools_util
 from qkeras.qtools import generate_layer_data_type_map
+from qkeras.qtools import dnc_layer_cost_ace
 
 
 class CostMode(enum.Enum):
-  NAIVE = 1  # cost is computed from theoretical equations.
+  ACE = 1  # cost is computed from theoretical equations.
   PE_AREA = 2  # cost is computed from compute area only.
   PE_BW_AREA = 3  # cost is computed from both compute and memory bandwidth.
 
@@ -118,6 +119,7 @@ def get_layer_quantizer_bitwidth(
       layer_item = self._layer_map[layer]
       weight_quantizer = qtools_util.get_val(layer_item, "weight_quantizer")
       mac_quantizer = qtools_util.get_val(layer_item, "multiplier")
+      acc_quantizer = qtools_util.get_val(layer_item, "accumulator")
       input_quantizer_list = qtools_util.get_val(
           layer_item, "input_quantizer_list")
       output_quantizer = qtools_util.get_val(layer_item, "output_quantizer")
@@ -134,6 +136,9 @@ def get_layer_quantizer_bitwidth(
           "mac_bits": (
               mac_quantizer.output.bits if mac_quantizer else
               input_quantizer_list[0].bits),
+          "acc_bits": (
+              acc_quantizer.output.bits if acc_quantizer else
+              input_quantizer_list[0].bits),
           "output_bits": output_quantizer.bits}
     else:
       # For the "dummy" head and tail nodes in the graph that we inserted at
@@ -142,6 +147,7 @@ def get_layer_quantizer_bitwidth(
           "input_bits": 0,
           "weight_bits": 0,
           "mac_bits": 0,
+          "acc_bits": 0,
           "output_bits": 0
       }
 
@@ -247,19 +253,27 @@ def get_per_layer_cost(layer_quantizer_bitwidth, layer_mac_count, layer_shapes,
                        InElementPerClk, OutElementPerClk, mode):
   """Area per layer, including both PE and memory Bandwidth."""
 
-  # TODO(lishanok@): needs a better cost modeling function. For now we simplify
-  # it to the number of multipliers + interface bitwidth.
-  assert mode == CostMode.NAIVE, "Only CostMode.NAIVE is supported for now."
+  # TODO(lishanok@): add modes that support data-driven cost modeling.
+  assert mode == CostMode.ACE, "Only CostMode.ACE is supported for now."
 
-  pe_area = (layer_quantizer_bitwidth["input_bits"] *
-             layer_quantizer_bitwidth["weight_bits"] * layer_mac_count *
-             cin_unroll * cout_unroll * kh_unroll * kw_unroll)
+  # Compute area is calculated according to ACE metric, translated to gates.
+  mac_gates = dnc_layer_cost_ace.get_ace_mac_gates(
+      xbit=layer_quantizer_bitwidth["input_bits"],
+      wbit=layer_quantizer_bitwidth["weight_bits"],
+      abit=layer_quantizer_bitwidth["acc_bits"],
+      regen_params=False)
+  pe_area = (mac_gates * layer_mac_count * cin_unroll * cout_unroll *
+             kh_unroll * kw_unroll)
 
+  # Memory includes input, output and weight memory, translated to gates.
   memory_area = (
-      InElementPerClk * layer_quantizer_bitwidth["input_bits"]
-      + OutElementPerClk * layer_quantizer_bitwidth["output_bits"] +
+      InElementPerClk * layer_quantizer_bitwidth["input_bits"] *
+      dnc_layer_cost_ace.MemoryGatesPerBit["Register"] +
+      OutElementPerClk * layer_quantizer_bitwidth["output_bits"] *
+      dnc_layer_cost_ace.MemoryGatesPerBit["Register"] +
       np.product(layer_shapes["weight_shape"]) *
-      layer_quantizer_bitwidth["weight_bits"])
+      layer_quantizer_bitwidth["weight_bits"] *
+      dnc_layer_cost_ace.MemoryGatesPerBit["ROM"])
 
   return (pe_area + memory_area)
 
@@ -323,7 +337,7 @@ def set_best_global_cost_in_paths(
     layer_idx: Int. The index value of the current layer's predecessor.
     cur_layer_idx: current layer's index value.
     layer_quantizer_bitwidth: Dict that contains layer-related quantizer
-      bitwidth, including mac_bits, input_bits and output_bits.
+      bitwidth, including acc_bits, mac_bits, input_bits and output_bits.
     layer_mac_count: Int. Use the number of multiplication as the operation
       count. To include the number of accumulations, we should multiply the
       value by 2, assuming accumulation count ~= multiplication count.
@@ -487,10 +501,11 @@ def multiply_elements_except_none(my_tuple):
     input_size = multiply_elements_except_none(layer.input_shape[:-1])
     output_size = multiply_elements_except_none(layer.output_shape[:-1])
     target_in_throughput = target_out_throughput * input_size / output_size
-    target_pe_throughput = max(target_out_throughput, target_in_throughput)
   else:
-    target_in_throughput = target_pe_throughput = target_out_throughput
+    target_in_throughput = target_out_throughput
 
+  # Per new design, target_pe_throughput equals target_out_throughput.
+  target_pe_throughput = target_out_throughput
   return target_in_throughput, target_pe_throughput
 
 
@@ -498,7 +513,7 @@ def calc_hw_params(graph, target_OutElementPerClk, target_out_throughput,
                    input_quantizer_bits,
                    compute_to_memory_max_ratio=4,
                    memory_to_unroll_max_ratio=4,
-                   mode=CostMode.NAIVE):
+                   mode=CostMode.ACE):
   """Calculate HW params that minimizes total cost.
 
   Args:
@@ -512,7 +527,7 @@ def calc_hw_params(graph, target_OutElementPerClk, target_out_throughput,
       ComputeOutElement and OutElement
     memory_to_unroll_max_ratio: Int. Max allowed ratio between
       InElementPerClk and CinUnroll
-    mode: CostMode. The mode to calculate per layer cost. Default is NAIVE.
+    mode: CostMode. The mode to calculate per layer cost. Default is ACE.
 
   Returns:
     best_path: Dict. Stores the best hw param value at each layer and their
@@ -580,6 +595,7 @@ def calc_hw_params(graph, target_OutElementPerClk, target_out_throughput,
     kernel_height = qtools_util.get_layer_info(cur_layer, "kernel_height")
     kernel_width = qtools_util.get_layer_info(cur_layer, "kernel_width")
     layer_type = qtools_util.get_layer_info(cur_layer, "layer_type")
+    output_channel_divisors = qtools_util.find_divisors(output_channel)
 
     logging.debug("input_channel: %d, output_channel: %d, kernel_height: %d, "
                   "kernel_width: %d, weight_quantizer_bits: %d",
@@ -620,6 +636,10 @@ def calc_hw_params(graph, target_OutElementPerClk, target_out_throughput,
         l = OutElementPerClk / ComputeOutElementPerClk
         cout_unroll = ComputeOutElementPerClk
 
+        # cout_unroll needs to be a divisor of output_channels
+        if cout_unroll not in output_channel_divisors:
+          continue
+
         logging.debug(
             ".........OutElementPerClk / ComputeOutElementPerClk = %.2f,"
             "cout_unroll=%.2f", l, cout_unroll)
@@ -708,7 +728,7 @@ def estimate_model_cost(
     target_out_throughput: float = 1.0,
     compute_to_memory_max_ratio: int = 4,
     memory_to_unroll_max_ratio: int = 4,
-    mode: CostMode = CostMode.NAIVE):
+    mode: CostMode = CostMode.ACE):
   """Main function to divide and conquer cost modeling.
 
   Args:
diff --git a/qkeras/qtools/dnc_layer_cost_ace.py b/qkeras/qtools/dnc_layer_cost_ace.py
new file mode 100644
index 00000000..b7d07da1
--- /dev/null
+++ b/qkeras/qtools/dnc_layer_cost_ace.py
@@ -0,0 +1,231 @@
+# Copyright 2019 Google LLC
+#
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""divide_and_conquer per layer cost modeling using ACE and data fitting.
+
+For a given layer with its hardware design params, predict its cost
+in actual ASIC implementation using ACE metric and actual MAC gates data points.
+"""
+
+import io
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+from scipy.optimize import curve_fit
+
+
+# Rule-of-thumb mapping between bits and gates in memory area estimate.
+MemoryGatesPerBit = {
+    'Register': 10.0,
+    'SRAM': 1.0,
+    'ROM': 0.1,
+}
+
+
+# Previously calculated 3D polynomial coefficients with relative MAE<5%.
+MAC_POLY3D_PARAMS = np.array([7.70469119, 13.76199652, -92.15756665])
+
+
+# MAC area data points generated from go/mac_vs_area.
+MAC24 = pd.read_csv(io.StringIO('''
+283,280,286,313,325,336,356,,
+274,290,325,372,401,428,485,,
+285,325,388,510,568,614,713,,
+308,372,509,750,865,1002,1167,,
+336,427,617,1003,1151,1309,,,
+356,480,722,1165,,,,,
+'''), header=None)
+
+MAC32 = pd.read_csv(io.StringIO('''
+391,365,377,410,453,433,458,507,
+364,382,418,466,497,521,578,685,
+378,418,485,594,659,721,832,1035,
+408,466,596,843,1029,1151,1321,1642,
+432,521,724,1153,1363,1512,1797,,
+457,578,830,1330,1551,1782,2273,,
+'''), header=None)
+
+MAC40 = pd.read_csv(io.StringIO('''
+458,457,470,500,522,527,551,605,664
+457,475,513,561,597,616,670,782,888
+470,513,579,699,766,816,928,1150,1358
+499,561,699,996,1161,1273,1499,1850,2189
+527,612,818,1275,1545,1691,2054,2516,
+549,670,927,1496,1798,2035,2490,3294,
+'''), header=None)
+
+MAC48 = pd.read_csv(io.StringIO('''
+595,550,566,594,659,624,642,694,745
+551,566,607,654,727,707,763,881,984
+566,607,679,794,871,921,1017,1270,1489
+594,655,793,1097,1285,1401,1668,2101,2378
+624,711,921,1397,1816,1950,2277,2763,3301
+642,762,1015,1669,1974,2264,2718,3631,4415
+'''), header=None)
+
+
+def mac_gates_polynomial_3d(xyz, a, b, c):
+  """Using a 3d polynomial function to model MAC area.
+
+  This function models the MAC area to be the sum of multiplier, accumulator
+  and a constant shift. Particularly, multiplier area is modeled to be linear
+  in input_bits * weight_bits, per ACE rule.
+
+  Args:
+    xyz: tuple of input, weight and accumulator bits.
+    a: polynomial coefficient 0.
+    b: polynomial coefficient 1.
+    c: polynomial coefficient 2.
+
+  Returns:
+    MAC area predicted by the function.
+  """
+  x, y, z = xyz
+  return a * x * y + b * z + c
+
+
+def gen_mac_gate_model(do_plot=False):
+  """Generate the polynomial cost model coefficients using given data.
+
+  Args:
+    do_plot: Bool indicating whether to plot the raw data and the fitted curve.
+
+  Returns:
+    params: The estimated params of the polynomial function.
+    mae_predict: Mean absolute error of predictions relative to the data mean.
+    parameter_std_deviation: one standard deviation errors on the parameters,
+      indicating the uncertainties of the params.
+  """
+  # acc bits, 1st index
+  abit = np.array([24, 32, 40, 48])
+  abit = np.repeat(abit, 54)
+
+  # weight bits, 2nd index
+  wbit = np.array([1, 2, 4, 8, 12, 16])
+  wbit = np.tile(np.repeat(wbit, 9), 4)
+
+  # input bits, 3rd index
+  xbit = np.array([1, 2, 4, 8, 10, 12, 16, 24, 32])
+  xbit = np.tile(xbit, 24)
+
+  # Record all mac area data points associated with each accumulator bitwidth
+  mac_arrs = []
+  # Record the start and end index of the mac area data points
+  # associated with each accumulator bitwidth
+  mac_arrs_index = {}
+  # Record index of all valid data points
+  valid_index = []
+  start_pos = 0
+
+  for (mac_acc, acc_bits) in zip(
+      [MAC24, MAC32, MAC40, MAC48], [24, 32, 40, 48]):
+    cur_mac = mac_acc.to_numpy().reshape(-1)
+    # Filter out nan data points
+    cur_valid_index = ~np.isnan(cur_mac)
+    cur_valid_mac = cur_mac[cur_valid_index]
+    # Record the data length for each accumulator bits
+    end_pos = start_pos + len(cur_valid_mac)
+    mac_arrs_index[acc_bits] = (start_pos, end_pos)
+    # Append mac areas of each accumulator bits to a list
+    mac_arrs += list(cur_valid_mac)
+    start_pos = end_pos
+    valid_index += list(cur_valid_index)
+
+  # Filter out invalid data
+  xbit = xbit[valid_index]
+  wbit = wbit[valid_index]
+  abit = abit[valid_index]
+
+  # curve fitting for all data points
+  params, covariance = curve_fit(
+      mac_gates_polynomial_3d, (xbit, wbit, abit), mac_arrs)
+
+  # Compute one standard deviation errors on the parameters.
+  parameter_std_deviation = np.sqrt(np.diag(covariance))
+
+  # Calculate the mean absolute error between prediction and given data.
+  mac_predict = mac_gates_polynomial_3d((xbit, wbit, abit), *params)
+  mae = np.mean(np.abs(mac_predict - mac_arrs))
+  mae_predict = mae / np.mean(mac_arrs)
+
+  if do_plot:
+    # Plot all raw data points
+    fig = plt.figure(figsize=(10, 10))
+    ax = fig.add_subplot(111, projection='3d')
+
+    ax.scatter(xbit, wbit, mac_arrs, label='Data')
+
+    ax.set_xlabel('X_bits')
+    ax.set_ylabel('W_bits')
+    ax.set_zlabel('MAC')
+
+    plt.title('MAC area data points')
+    plt.show()
+
+    # Generate a mesh grid for plotting.
+    x_fit = np.linspace(min(xbit), max(xbit), 50)
+    w_fit = np.linspace(min(wbit), max(wbit), 50)
+    xmesh, wmesh = np.meshgrid(x_fit, w_fit)
+
+    fig = plt.figure(figsize=(16, 16))
+    index = 1
+
+    # Plotting 3D fitting curve for each accumulator bitwidth
+    for acc_bits in [24, 32, 40, 48]:
+      ax = fig.add_subplot(2, 2, index, projection='3d')
+
+      start_pos = mac_arrs_index[acc_bits][0]
+      end_pos = mac_arrs_index[acc_bits][1]
+      ax.scatter(xbit[start_pos:end_pos], wbit[start_pos:end_pos],
+                 mac_arrs[start_pos:end_pos], label='Data')
+
+      amesh = np.full(shape=(50, 50), fill_value=acc_bits)
+      poly_fit = mac_gates_polynomial_3d((xmesh, wmesh, amesh), *params)
+
+      ax.plot_surface(
+          xmesh, wmesh, poly_fit, cmap='viridis', alpha=0.8,
+          label=f'Fitted Surface | acc_bits={acc_bits}')
+
+      ax.set_xlabel('X')
+      ax.set_ylabel('W')
+      ax.set_zlabel('MAC')
+      ax.set_title(f'accumulator bitwidth: {acc_bits}')
+      index += 1
+
+    plt.show()
+
+  return params, mae_predict, parameter_std_deviation
+
+
+def get_ace_mac_gates(xbit, wbit, abit, regen_params=False):
+  """Function to estimate MAC area, including 1 multiplier and 1 accumulator.
+
+  Args:
+    xbit: int. input bits.
+    wbit: int. weight bits.
+    abit: int. accumulator bits.
+    regen_params: Bool. If True, regenerate the MAC cost model coefficients.
+      If False, reuse the previously generated model coefficients.
+
+  Returns:
+    Estimated MAC gates.
+  """
+  if regen_params:
+    mac_params, _, _ = gen_mac_gate_model(do_plot=True)
+  else:
+    mac_params = MAC_POLY3D_PARAMS
+
+  return mac_gates_polynomial_3d((xbit, wbit, abit), *mac_params)
diff --git a/requirements.txt b/requirements.txt
index f3b6668c..736a3e83 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,6 +1,8 @@
 tensorflow>=2.5.0rc0
 numpy>=1.16.5
 pyparser
+pandas>=1.1.0
+matplotlib>=3.3.0
 scipy>=1.4.1
 setuptools>=41.0.0
 argparse>=1.4.0
diff --git a/tests/qtools_model_test.py b/tests/qtools_model_test.py
index 141032cd..4076f612 100644
--- a/tests/qtools_model_test.py
+++ b/tests/qtools_model_test.py
@@ -985,7 +985,7 @@ def test_divide_and_conquer_sequential_conv2d():
       target_out_throughput=1.0,
       compute_to_memory_max_ratio=1,
       memory_to_unroll_max_ratio=1,
-      mode=divide_and_conquer.CostMode.NAIVE,
+      mode=divide_and_conquer.CostMode.ACE,
   )