From 12354124024873562a8cb578b9c24e4de54c2086 Mon Sep 17 00:00:00 2001
From: David Tanner <guacamoleo@gmail.com>
Date: Fri, 24 Feb 2017 09:31:13 -0600
Subject: [PATCH 01/21] fixing innocuous indentation error

---
 Tensile/SolutionWriter.py | 30 +++++++++++++++---------------
 1 file changed, 15 insertions(+), 15 deletions(-)

diff --git a/Tensile/SolutionWriter.py b/Tensile/SolutionWriter.py
index c09b72d70..289bd1940 100644
--- a/Tensile/SolutionWriter.py
+++ b/Tensile/SolutionWriter.py
@@ -830,21 +830,21 @@ def solutionParametersConsistent(solution):
     else:
       solution["NumLoadsB"] = totalElementsB / numThreads
 
-      # how many loads para
-      if solution["NumLoadsA"] % solution["NumLoadsCoalescedA"] != 0:
-        if printReason: print2("numLoadsA %u %% numLoadsParaA %u != 0" \
-            % (solution["NumLoadsA"], solution["NumLoadsCoalescedA"]))
-        return False
-      else:
-        solution["NumLoadsPerpendicularA"] = solution["NumLoadsA"] \
-            / solution["NumLoadsCoalescedA"]
-      if solution["NumLoadsB"] % solution["NumLoadsCoalescedB"] != 0:
-        if printReason: print2("numLoadsB %u %% numLoadsParaB %u != 0" \
-            % (solution["NumLoadsB"], solution["NumLoadsCoalescedB"]))
-        return False
-      else:
-        solution["NumLoadsPerpendicularB"] = solution["NumLoadsB"] \
-            / solution["NumLoadsCoalescedB"]
+    # how many loads para
+    if solution["NumLoadsA"] % solution["NumLoadsCoalescedA"] != 0:
+      if printReason: print2("numLoadsA %u %% numLoadsParaA %u != 0" \
+          % (solution["NumLoadsA"], solution["NumLoadsCoalescedA"]))
+      return False
+    else:
+      solution["NumLoadsPerpendicularA"] = solution["NumLoadsA"] \
+          / solution["NumLoadsCoalescedA"]
+    if solution["NumLoadsB"] % solution["NumLoadsCoalescedB"] != 0:
+      if printReason: print2("numLoadsB %u %% numLoadsParaB %u != 0" \
+          % (solution["NumLoadsB"], solution["NumLoadsCoalescedB"]))
+      return False
+    else:
+      solution["NumLoadsPerpendicularB"] = solution["NumLoadsB"] \
+          / solution["NumLoadsCoalescedB"]
 
     # load size para/perp A
     if totalElementsParaA % solution["NumLoadsCoalescedA"] != 0:

From 137cd977383f20a3bc11a37729fb7cd02bce6d42 Mon Sep 17 00:00:00 2001
From: David Tanner <guacamoleo@gmail.com>
Date: Fri, 24 Feb 2017 10:18:31 -0600
Subject: [PATCH 02/21] fixing solution validity and derived parameters

---
 Tensile/BenchmarkProblems.py |  14 +++-
 Tensile/SolutionStructs.py   | 145 ++++++++++++++++++++++++++++++++++-
 Tensile/SolutionWriter.py    |   4 +
 3 files changed, 156 insertions(+), 7 deletions(-)

diff --git a/Tensile/BenchmarkProblems.py b/Tensile/BenchmarkProblems.py
index 8ec4ed719..4e1c3f932 100644
--- a/Tensile/BenchmarkProblems.py
+++ b/Tensile/BenchmarkProblems.py
@@ -169,8 +169,14 @@ def benchmarkProblemType( config ):
                 benchmarkStep.initialSolutionParameters[initialSolutionParameterName]
         # TODO check if solution matches problem size for exact tile kernels
         solutionObject = Solution(solution)
-        if SolutionWriter.solutionParametersConsistent(solutionObject):
-          solutions[hardcodedIdx].append(solutionObject)
+        if solutionObject["Valid"]:
+          print str(solutionObject), " LOOKING"
+          for s in solutions:
+            print s
+          if solutionObject not in solutions:
+            solutions[hardcodedIdx].append(solutionObject)
+          else:
+            print "SOLUTION ALREADY IN SOLUTIONS"
           if globalParameters["PrintLevel"] >= 1:
             sys.stdout.write("|")
         else:
@@ -541,8 +547,8 @@ def get( lookupHardcodedParameters, winners ):
           #for paramName in hardcodedFrozen:
           #  paramValue = hardcodedFrozen[paramName]
           #  matchUnion[paramName] = paramValue
-          Solution.assignDimsFromEdgeAndShape(matchUnion)
-          Solution.assignDimsFromEdgeAndShape(hardcodedFrozen.parameters)
+          Solution.assignDerivedParameters(matchUnion)
+          Solution.assignDerivedParameters(hardcodedFrozen.parameters)
           if matchUnion["MacroTile0"] != lookupMacroTile0 \
               or matchUnion["MacroTile1"] != lookupMacroTile1:
             matchMacroTile = False
diff --git a/Tensile/SolutionStructs.py b/Tensile/SolutionStructs.py
index b21f98728..e1fd318ca 100644
--- a/Tensile/SolutionStructs.py
+++ b/Tensile/SolutionStructs.py
@@ -505,7 +505,7 @@ def __init__(self, config):
       if key != "ProblemType" and key not in self.state:
         self.state[key] = config[key]
 
-    Solution.assignDimsFromEdgeAndShape(self.state)
+    Solution.assignDerivedParameters(self.state)
 
   ########################################
   # get a list of kernel parameters for this solution
@@ -531,7 +531,8 @@ def getKernels(self):
   ########################################
   # assign Dim0, 1 based on edge and shape
   @staticmethod
-  def assignDimsFromEdgeAndShape(state):
+  def assignDerivedParameters(state):
+
     # workgroup sizes
     state["WorkGroup0"] = state["WorkGroupEdge"]
     state["WorkGroup1"] = state["WorkGroupEdge"]
@@ -556,6 +557,143 @@ def assignDimsFromEdgeAndShape(state):
     if "SplitU" in state and "LoopUnroll" in state:
       state["DepthU"] = state["SplitU"] * state["LoopUnroll"]
 
+    printReason = False
+
+    # num threads
+    state["NumThreads"] = state["WorkGroup0"]*state["WorkGroup1"]
+    if state["NumThreads"] > globalParameters["MaxThreads"]:
+      if printReason: print2("rejecting %u threads" % state["NumThreads"])
+      state["Valid"] = False
+      return
+
+    # how many elements to load
+    if state["ProblemType"]["TLUA"]:
+      totalElementsParaA = state["MacroTile0"]
+      totalElementsPerpA = state["LoopUnroll"]
+    else:
+      totalElementsParaA = state["LoopUnroll"]
+      totalElementsPerpA = state["MacroTile0"]
+
+    if state["ProblemType"]["TLUB"]:
+      totalElementsParaB = state["MacroTile1"]
+      totalElementsPerpB = state["LoopUnroll"]
+    else:
+      totalElementsParaB = state["LoopUnroll"]
+      totalElementsPerpB = state["MacroTile1"]
+    totalElementsA = totalElementsParaA * totalElementsPerpA
+    totalElementsB = totalElementsParaB * totalElementsPerpB
+
+    # how many load instructions
+    if totalElementsA % state["NumThreads"] != 0:
+      if printReason: print2("totalElementsA %u %% NumThreads %u != 0" \
+          % (totalElementsA, state["NumThreads"]))
+      state["Valid"] = False
+      return
+    else:
+      state["NumLoadsA"] = totalElementsA / state["NumThreads"]
+    if totalElementsB % state["NumThreads"] != 0:
+      if printReason: print2("totalElementsB %u %% NumThreads %u != 0" \
+          % (totalElementsB, state["NumThreads"]))
+      state["Valid"] = False
+      return
+    else:
+      state["NumLoadsB"] = totalElementsB / state["NumThreads"]
+
+    # how many loads para
+    if state["NumLoadsCoalescedA"] < 1:
+      state["NumLoadsCoalescedA"] = state["NumLoadsA"]
+      print "Assigning NLCA=%u" % state["NumLoadsA"]
+    if state["NumLoadsA"] % state["NumLoadsCoalescedA"] != 0:
+      if printReason: print2("numLoadsA %u %% numLoadsParaA %u != 0" \
+          % (state["NumLoadsA"], state["NumLoadsCoalescedA"]))
+      state["Valid"] = False
+      return
+    else:
+      state["NumLoadsPerpendicularA"] = state["NumLoadsA"] \
+          / state["NumLoadsCoalescedA"]
+    if state["NumLoadsCoalescedB"] < 1:
+      state["NumLoadsCoalescedB"] = state["NumLoadsB"]
+      print "Assigning NLCB=%u" % state["NumLoadsB"]
+    if state["NumLoadsB"] % state["NumLoadsCoalescedB"] != 0:
+      if printReason: print2("numLoadsB %u %% numLoadsParaB %u != 0" \
+          % (state["NumLoadsB"], state["NumLoadsCoalescedB"]))
+      state["Valid"] = False
+      return
+    else:
+      state["NumLoadsPerpendicularB"] = state["NumLoadsB"] \
+          / state["NumLoadsCoalescedB"]
+
+    # load size para/perp A
+    if totalElementsParaA % state["NumLoadsCoalescedA"] != 0:
+      if printReason: print2("totalElementsParaA %u %% numLoadsParaA %u != 0" \
+          % (totalElementsParaA, state["NumLoadsCoalescedA"]))
+      state["Valid"] = False
+      return
+    #else:
+    #  loadSizeParaA = totalElementsParaA / state["NumLoadsCoalescedA"]
+    if totalElementsPerpA % state["NumLoadsPerpendicularA"] != 0:
+      if printReason: print2("totalElementsPerpA %u %% numLoadsPerpA %u != 0" \
+          % (totalElementsPerpA, state["NumLoadsPerpendicularA"]))
+      state["Valid"] = False
+      return
+    #else:
+    #  loadSizePerpA = totalElementsPerpA / state["NumLoadsPerpendicularA"]
+
+    # load size para/perp B
+    if totalElementsParaB % state["NumLoadsCoalescedB"] != 0:
+      if printReason: print2("totalElementsParaB %u %% numLoadsParaB %u != 0" \
+          % (totalElementsParaB, state["NumLoadsCoalescedB"]))
+      state["Valid"] = False
+      return
+    #else:
+    #  loadSizeParaB = totalElementsParaB / state["NumLoadsCoalescedB"]
+    if totalElementsPerpB % state["NumLoadsPerpendicularB"] != 0:
+      if printReason: print2("totalElementsPerpB %u %% numLoadsPerpB %u != 0" \
+          % (totalElementsPerpB, state["NumLoadsPerpendicularB"]))
+      state["Valid"] = False
+      return
+    #else:
+    #  loadSizePerpB = totalElementsPerpB / state["NumLoadsPerpendicularB"]
+
+    # too much LDS
+    sizeLDS = state["LoopUnroll"] \
+        * (state["PadLDS"] * 2 + state["MacroTile0"] \
+        + state["MacroTile1"] ) \
+        * state["ProblemType"]["DataType"].numBytes()
+    if sizeLDS > globalParameters["MaxLDS"]:
+      if printReason: print2("Kernel Uses %u > %u bytes" % ( sizeLDS, globalParameters["MaxLDS"]))
+      state["Valid"] = False
+      return
+
+    # Compiler may be causing incorrect spills on ROCm1.4 from DT on 2/21/17
+    if globalParameters["Backend"] == "HIP":
+      if state["ProblemType"]["DataType"].value == DataType.single:
+        if state["MacroTile0"] == 128 or state["MacroTile1"] == 128:
+          if state["NumLoadsCoalescedA"] != 1 and state["NumLoadsCoalescedB"] != 8:
+            state["Valid"] = False
+            return
+      elif state["ProblemType"]["DataType"].value == DataType.double:
+        if globalParameters["Backend"] == "HIP":
+          if state["MacroTile0"] >= 64 or state["MacroTile1"] >= 64:
+            state["Valid"] = False
+            return
+
+    state["Valid"] = True
+
+# validation failures
+# Cijk_Ailk_Bjlk_SB_DU16_LU16_MT064_MT164_NLA16_NLB16_NLCA02_NLCB01_NLPA08_NLPB16_TT008_TT108_TTE08_WG008_WG108_WGE08
+# Cijk_Ailk_Bjlk_SB_DU16_LU16_MT064_MT164_NLA16_NLB16_NLCA04_NLCB02_NLPA04_NLPB08_TT008_TT108_TTE08_WG008_WG108_WGE08
+# Cijk_Ailk_Bjlk_SB_DU16_LU16_MT064_MT164_NLA16_NLB16_NLCA02_NLCB04_NLPA08_NLPB04_TT008_TT108_TTE08_WG008_WG108_WGE08
+
+# Cijk_Ailk_Bjlk_DB_DU16_LU16_MT064_MT164_NLA16_NLB16_NLCA04_NLCB01_NLPA04_NLPB16_TT008_TT108_TTE08_WG008_WG108_WGE08
+# Cijk_Ailk_Bjlk_DB_DU08_LU08_MT064_MT164_NLA08_NLB08_NLCA01_NLCB01_NLPA08_NLPB08_TT008_TT108_TTE08_WG008_WG108_WGE08
+# Cijk_Ailk_Bjlk_DB_DU08_LU08_MT064_MT164_NLA08_NLB08_NLCA08_NLCB01_NLPA01_NLPB08_TT008_TT108_TTE08_WG008_WG108_WGE08
+# Cijk_Ailk_Bjlk_DB_DU08_LU08_MT064_MT164_NLA08_NLB08_NLCA08_NLCB08_NLPA01_NLPB01_TT008_TT108_TTE08_WG008_WG108_WGE08
+# Cijk_Ailk_Bjlk_DB_DU16_LU16_MT064_MT164_NLA16_NLB16_NLCA08_NLCB08_NLPA02_NLPB02_TT008_TT108_TTE08_WG008_WG108_WGE08
+# Cijk_Ailk_Bjlk_DB_DU08_LU08_MT064_MT164_NLA08_NLB08_NLCA01_NLCB08_NLPA08_NLPB01_TT008_TT108_TTE08_WG008_WG108_WGE08
+
+
+
 
   ########################################
   # create a dictionary with booleans on whether to include parameter in name
@@ -746,7 +884,8 @@ def __hash__(self):
     return hash(str(self))
     #return hash(self.getAttributes())
   def __eq__(self, other):
-    return isinstance(other, Solution) and self.getAttributes() == other.getAttributes()
+    #return isinstance(other, Solution) and self.getAttributes() == other.getAttributes()
+    return isinstance(other, Solution) and str(self) == str(other)
   def __ne__(self, other):
     result = self.__eq__(other)
     if result is NotImplemented:
diff --git a/Tensile/SolutionWriter.py b/Tensile/SolutionWriter.py
index 289bd1940..452c9c45e 100644
--- a/Tensile/SolutionWriter.py
+++ b/Tensile/SolutionWriter.py
@@ -831,6 +831,8 @@ def solutionParametersConsistent(solution):
       solution["NumLoadsB"] = totalElementsB / numThreads
 
     # how many loads para
+    if solution["NumLoadsCoalescedA"] < 1:
+      solution["NumLoadsCoalescedA"] = solution["NumLoadsA"]
     if solution["NumLoadsA"] % solution["NumLoadsCoalescedA"] != 0:
       if printReason: print2("numLoadsA %u %% numLoadsParaA %u != 0" \
           % (solution["NumLoadsA"], solution["NumLoadsCoalescedA"]))
@@ -838,6 +840,8 @@ def solutionParametersConsistent(solution):
     else:
       solution["NumLoadsPerpendicularA"] = solution["NumLoadsA"] \
           / solution["NumLoadsCoalescedA"]
+    if solution["NumLoadsCoalescedB"] < 1:
+      solution["NumLoadsCoalescedB"] = solution["NumLoadsB"]
     if solution["NumLoadsB"] % solution["NumLoadsCoalescedB"] != 0:
       if printReason: print2("numLoadsB %u %% numLoadsParaB %u != 0" \
           % (solution["NumLoadsB"], solution["NumLoadsCoalescedB"]))

From d795ca6e65501423553b6b760f2ab9b0000b60d9 Mon Sep 17 00:00:00 2001
From: David Tanner <guacamoleo@gmail.com>
Date: Fri, 24 Feb 2017 12:34:33 -0600
Subject: [PATCH 03/21] fixed NumLoadsCoalesced=-1, assigningDerivedParameters

---
 Tensile/BenchmarkProblems.py |  18 +--
 Tensile/SolutionStructs.py   | 213 +++++++++++++++++++----------------
 Tensile/__init__.py          |   2 +-
 3 files changed, 123 insertions(+), 110 deletions(-)

diff --git a/Tensile/BenchmarkProblems.py b/Tensile/BenchmarkProblems.py
index 4e1c3f932..cd0bbc575 100644
--- a/Tensile/BenchmarkProblems.py
+++ b/Tensile/BenchmarkProblems.py
@@ -37,7 +37,7 @@ def benchmarkProblemType( config ):
 
   totalBenchmarkSteps = len(benchmarkProcess)
   winners = WinningParameterDict()
-  print1("NumBenchmarkSteps: %u" % totalBenchmarkSteps)
+  print1("# NumBenchmarkSteps: %u" % totalBenchmarkSteps)
   print1("")
   print1(HR)
   print1("# Done Creating BenchmarkProcess Object")
@@ -170,13 +170,13 @@ def benchmarkProblemType( config ):
         # TODO check if solution matches problem size for exact tile kernels
         solutionObject = Solution(solution)
         if solutionObject["Valid"]:
-          print str(solutionObject), " LOOKING"
-          for s in solutions:
-            print s
-          if solutionObject not in solutions:
+          hasSolution = False
+          for hardcodedSolutions in solutions:
+            for hardcodedSolution in hardcodedSolutions:
+              if hardcodedSolution == solutionObject:
+                hasSolution = True
+          if not hasSolution:
             solutions[hardcodedIdx].append(solutionObject)
-          else:
-            print "SOLUTION ALREADY IN SOLUTIONS"
           if globalParameters["PrintLevel"] >= 1:
             sys.stdout.write("|")
         else:
@@ -547,8 +547,8 @@ def get( lookupHardcodedParameters, winners ):
           #for paramName in hardcodedFrozen:
           #  paramValue = hardcodedFrozen[paramName]
           #  matchUnion[paramName] = paramValue
-          Solution.assignDerivedParameters(matchUnion)
-          Solution.assignDerivedParameters(hardcodedFrozen.parameters)
+          Solution.assignProblemIndependentDerivedParameters(matchUnion)
+          Solution.assignProblemIndependentDerivedParameters(hardcodedFrozen.parameters)
           if matchUnion["MacroTile0"] != lookupMacroTile0 \
               or matchUnion["MacroTile1"] != lookupMacroTile1:
             matchMacroTile = False
diff --git a/Tensile/SolutionStructs.py b/Tensile/SolutionStructs.py
index e1fd318ca..5f4eff66e 100644
--- a/Tensile/SolutionStructs.py
+++ b/Tensile/SolutionStructs.py
@@ -95,9 +95,9 @@ def zeroString(self, backend):
     zeroString = "(%s)(" % self.toDevice(backend)
     if self.value == self.single or self.value == self.half:
       zeroString += "0.f"
-    elif self.value == self.double: 
+    elif self.value == self.double:
       zeroString += "0.0"
-    elif self.value == self.complexSingle: 
+    elif self.value == self.complexSingle:
       zeroString += "0.f, 0.f"
     elif self.value == self.complexDouble:
       zeroString += "0.0, 0.0"
@@ -214,7 +214,8 @@ def __init__(self, config):
     elif self["OperationType"] == "TensorContraction":
       self.initTensorContraction(config)
 
-    self.assignIndices()
+    self.state["AssignedDerivedParameters"] = False
+    ProblemType.assignDerivedParameters(self.state)
 
 
   ########################################
@@ -249,90 +250,98 @@ def isTensorContraction(self):
 
   ########################################
   # determine d0, d1, dU
-  def assignIndices(self):
-    self["TotalIndices"] = max(max(self["IndexAssignmentsA"])+1, max(self["IndexAssignmentsB"])+1)
+  @staticmethod
+  def assignDerivedParameters(state):
+    if "AssignedDerivedParameters" in state:
+      if state["AssignedDerivedParameters"]:
+        return
+    state["AssignedDerivedParameters"] = False
+
+    state["TotalIndices"] = max(max(state["IndexAssignmentsA"])+1, \
+        max(state["IndexAssignmentsB"])+1)
 
     # determine num free, batch
-    self["IndicesFree"] = []
-    self["IndicesBatch"] = []
-    self["IndicesSummation"] = []
+    state["IndicesFree"] = []
+    state["IndicesBatch"] = []
+    state["IndicesSummation"] = []
 
-    for i in range(0, self["NumIndicesC"]):
-      inA = i in self["IndexAssignmentsA"]
-      inB = i in self["IndexAssignmentsB"]
+    for i in range(0, state["NumIndicesC"]):
+      inA = i in state["IndexAssignmentsA"]
+      inB = i in state["IndexAssignmentsB"]
       if inA and inB:
-        #self["NumIndicesBatch"] = (i+1)-self["NumIndicesFree"]
-        self["IndicesBatch"].append(i)
+        #state["NumIndicesBatch"] = (i+1)-state["NumIndicesFree"]
+        state["IndicesBatch"].append(i)
 
       elif inA or inB:
-        #self["NumIndicesFree"] = (i+1)
-        self["IndicesFree"].append(i)
+        #state["NumIndicesFree"] = (i+1)
+        state["IndicesFree"].append(i)
       else:
         printExit("invalid index %u" % i)
 
     # determine num summation
-    for i in range(self["NumIndicesC"], self["TotalIndices"]):
-      inA = i in self["IndexAssignmentsA"]
-      inB = i in self["IndexAssignmentsB"]
+    for i in range(state["NumIndicesC"], state["TotalIndices"]):
+      inA = i in state["IndexAssignmentsA"]
+      inB = i in state["IndexAssignmentsB"]
       if inA and inB:
-        #self["NumIndicesSummation"] = (i+1)-self["NumIndicesC"]
-        self.state["IndicesSummation"].append(i)
+        #state["NumIndicesSummation"] = (i+1)-state["NumIndicesC"]
+        state["IndicesSummation"].append(i)
       else:
         printExit("invalid index %u" % i)
-    self["NumIndicesFree"] = len(self["IndicesFree"])
-    self["NumIndicesBatch"] = len(self["IndicesBatch"])
-    self["NumIndicesSummation"] = len(self["IndicesSummation"])
+    state["NumIndicesFree"] = len(state["IndicesFree"])
+    state["NumIndicesBatch"] = len(state["IndicesBatch"])
+    state["NumIndicesSummation"] = len(state["IndicesSummation"])
 
 
     # by default, unroll index will be the first summation index
     # TODO sort summation indices by "stride"
-    self["IndexUnroll"] = self["IndicesSummation"][0]
-    for i in range(0, len(self["IndexAssignmentsA"])):
-      if self["IndexAssignmentsA"][i] == self["IndexUnroll"]:
-        self["IndexUnrollA"] = i
+    state["IndexUnroll"] = state["IndicesSummation"][0]
+    for i in range(0, len(state["IndexAssignmentsA"])):
+      if state["IndexAssignmentsA"][i] == state["IndexUnroll"]:
+        state["IndexUnrollA"] = i
         break
-    for i in range(0, len(self["IndexAssignmentsB"])):
-      if self["IndexAssignmentsB"][i] == self["IndexUnroll"]:
-        self["IndexUnrollB"] = i
+    for i in range(0, len(state["IndexAssignmentsB"])):
+      if state["IndexAssignmentsB"][i] == state["IndexUnroll"]:
+        state["IndexUnrollB"] = i
         break
 
     # assign d0, d1
-    self["Index01A"] = -1
-    self["Index01B"] = -1
-    for i in self["IndexAssignmentsA"]:
-      if i < self["NumIndicesC"]:
-        self["Index01A"] = i
+    state["Index01A"] = -1
+    state["Index01B"] = -1
+    for i in state["IndexAssignmentsA"]:
+      if i < state["NumIndicesC"]:
+        state["Index01A"] = i
         break
-    for i in self["IndexAssignmentsB"]:
-      if i < self["NumIndicesC"]:
-        self["Index01B"] = i
+    for i in state["IndexAssignmentsB"]:
+      if i < state["NumIndicesC"]:
+        state["Index01B"] = i
         break
     # whichever has lower stride in C (lower value), is 0, other is 1
-    if self["Index01A"] < self["Index01B"]:
-      self["Index0"]  = self["Index01A"]
-      self["Index1"]  = self["Index01B"]
-      self["Tensor0"] = 0
-      self["Tensor1"] = 1
-      self["TileA"] = 0
-      self["TileB"] = 1
+    if state["Index01A"] < state["Index01B"]:
+      state["Index0"]  = state["Index01A"]
+      state["Index1"]  = state["Index01B"]
+      state["Tensor0"] = 0
+      state["Tensor1"] = 1
+      state["TileA"] = 0
+      state["TileB"] = 1
     else:
-      self["Index0"]  = self["Index01B"]
-      self["Index1"]  = self["Index01A"]
-      self["Tensor0"] = 1
-      self["Tensor1"] = 0
-      self["TileA"] = 1
-      self["TileB"] = 0
+      state["Index0"]  = state["Index01B"]
+      state["Index1"]  = state["Index01A"]
+      state["Tensor0"] = 1
+      state["Tensor1"] = 0
+      state["TileA"] = 1
+      state["TileB"] = 0
 
     # generalize transpose
-    strideIdxA = self["IndexAssignmentsA"].index(self["Index01A"])
-    strideIdxB = self["IndexAssignmentsB"].index(self["Index01B"])
-    unrollIdxA = self["IndexAssignmentsA"].index(self["IndexUnroll"])
-    unrollIdxB = self["IndexAssignmentsB"].index(self["IndexUnroll"])
-    self["TLUA"] = strideIdxA < unrollIdxA
-    self["TLUB"] = strideIdxB < unrollIdxB
+    strideIdxA = state["IndexAssignmentsA"].index(state["Index01A"])
+    strideIdxB = state["IndexAssignmentsB"].index(state["Index01B"])
+    unrollIdxA = state["IndexAssignmentsA"].index(state["IndexUnroll"])
+    unrollIdxB = state["IndexAssignmentsB"].index(state["IndexUnroll"])
+    state["TLUA"] = strideIdxA < unrollIdxA
+    state["TLUB"] = strideIdxB < unrollIdxB
 
     #unrollDimStrideGreaterThanTileDimStrideA = TLUA
     #unrollDimStrideLessThanTileDimStrideB    = !TLUB
+    state["AssignedDerivedParameters"] = True
 
 
 
@@ -364,6 +373,18 @@ def __str__(self):
     if self["UseInitialStrides"]: name += "I"
     return name
 
+  def keys(self):
+    return self.state.keys()
+  def __len__(self):
+    return len(self.state)
+  def __iter__(self):
+    return iter(self.state)
+
+
+
+
+
+
   def __getitem__(self, key):
     return self.state[key]
   def __setitem__(self, key, value):
@@ -504,7 +525,9 @@ def __init__(self, config):
     for key in config:
       if key != "ProblemType" and key not in self.state:
         self.state[key] = config[key]
-
+    self["Valid"] = True
+    self["AssignedProblemIndependentDerivedParameters"] = False
+    self["AssignedDerivedParameters"] = False
     Solution.assignDerivedParameters(self.state)
 
   ########################################
@@ -529,10 +552,13 @@ def getKernels(self):
 
 
   ########################################
-  # assign Dim0, 1 based on edge and shape
+  # assign tile sizes
   @staticmethod
-  def assignDerivedParameters(state):
-
+  def assignProblemIndependentDerivedParameters(state):
+    if "AssignedProblemIndependentDerivedParameters" in state:
+      if state["AssignedProblemIndependentDerivedParameters"]:
+        return
+    state["AssignedProblemIndependentDerivedParameters"] = False
     # workgroup sizes
     state["WorkGroup0"] = state["WorkGroupEdge"]
     state["WorkGroup1"] = state["WorkGroupEdge"]
@@ -557,14 +583,25 @@ def assignDerivedParameters(state):
     if "SplitU" in state and "LoopUnroll" in state:
       state["DepthU"] = state["SplitU"] * state["LoopUnroll"]
 
-    printReason = False
-
     # num threads
     state["NumThreads"] = state["WorkGroup0"]*state["WorkGroup1"]
     if state["NumThreads"] > globalParameters["MaxThreads"]:
       if printReason: print2("rejecting %u threads" % state["NumThreads"])
       state["Valid"] = False
-      return
+    state["AssignedProblemIndependentDerivedParameters"] = True
+
+  ########################################
+  # assign all derived parameters
+  @staticmethod
+  def assignDerivedParameters(state):
+    Solution.assignProblemIndependentDerivedParameters(state)
+    if "AssignedDerivedParameters" in state:
+      if state["AssignedDerivedParameters"]:
+        return
+    state["AssignedDerivedParameters"] = False
+
+    ProblemType.assignDerivedParameters(state["ProblemType"])
+    printReason = False
 
     # how many elements to load
     if state["ProblemType"]["TLUA"]:
@@ -588,37 +625,35 @@ def assignDerivedParameters(state):
       if printReason: print2("totalElementsA %u %% NumThreads %u != 0" \
           % (totalElementsA, state["NumThreads"]))
       state["Valid"] = False
-      return
+      #return
     else:
       state["NumLoadsA"] = totalElementsA / state["NumThreads"]
     if totalElementsB % state["NumThreads"] != 0:
       if printReason: print2("totalElementsB %u %% NumThreads %u != 0" \
           % (totalElementsB, state["NumThreads"]))
       state["Valid"] = False
-      return
+      #return
     else:
       state["NumLoadsB"] = totalElementsB / state["NumThreads"]
 
     # how many loads para
     if state["NumLoadsCoalescedA"] < 1:
       state["NumLoadsCoalescedA"] = state["NumLoadsA"]
-      print "Assigning NLCA=%u" % state["NumLoadsA"]
     if state["NumLoadsA"] % state["NumLoadsCoalescedA"] != 0:
       if printReason: print2("numLoadsA %u %% numLoadsParaA %u != 0" \
           % (state["NumLoadsA"], state["NumLoadsCoalescedA"]))
       state["Valid"] = False
-      return
+      #return
     else:
       state["NumLoadsPerpendicularA"] = state["NumLoadsA"] \
           / state["NumLoadsCoalescedA"]
     if state["NumLoadsCoalescedB"] < 1:
       state["NumLoadsCoalescedB"] = state["NumLoadsB"]
-      print "Assigning NLCB=%u" % state["NumLoadsB"]
     if state["NumLoadsB"] % state["NumLoadsCoalescedB"] != 0:
       if printReason: print2("numLoadsB %u %% numLoadsParaB %u != 0" \
           % (state["NumLoadsB"], state["NumLoadsCoalescedB"]))
       state["Valid"] = False
-      return
+      #return
     else:
       state["NumLoadsPerpendicularB"] = state["NumLoadsB"] \
           / state["NumLoadsCoalescedB"]
@@ -628,14 +663,14 @@ def assignDerivedParameters(state):
       if printReason: print2("totalElementsParaA %u %% numLoadsParaA %u != 0" \
           % (totalElementsParaA, state["NumLoadsCoalescedA"]))
       state["Valid"] = False
-      return
+      #return
     #else:
     #  loadSizeParaA = totalElementsParaA / state["NumLoadsCoalescedA"]
     if totalElementsPerpA % state["NumLoadsPerpendicularA"] != 0:
       if printReason: print2("totalElementsPerpA %u %% numLoadsPerpA %u != 0" \
           % (totalElementsPerpA, state["NumLoadsPerpendicularA"]))
       state["Valid"] = False
-      return
+      #return
     #else:
     #  loadSizePerpA = totalElementsPerpA / state["NumLoadsPerpendicularA"]
 
@@ -644,14 +679,14 @@ def assignDerivedParameters(state):
       if printReason: print2("totalElementsParaB %u %% numLoadsParaB %u != 0" \
           % (totalElementsParaB, state["NumLoadsCoalescedB"]))
       state["Valid"] = False
-      return
+      #return
     #else:
     #  loadSizeParaB = totalElementsParaB / state["NumLoadsCoalescedB"]
     if totalElementsPerpB % state["NumLoadsPerpendicularB"] != 0:
       if printReason: print2("totalElementsPerpB %u %% numLoadsPerpB %u != 0" \
           % (totalElementsPerpB, state["NumLoadsPerpendicularB"]))
       state["Valid"] = False
-      return
+      #return
     #else:
     #  loadSizePerpB = totalElementsPerpB / state["NumLoadsPerpendicularB"]
 
@@ -663,7 +698,7 @@ def assignDerivedParameters(state):
     if sizeLDS > globalParameters["MaxLDS"]:
       if printReason: print2("Kernel Uses %u > %u bytes" % ( sizeLDS, globalParameters["MaxLDS"]))
       state["Valid"] = False
-      return
+      #return
 
     # Compiler may be causing incorrect spills on ROCm1.4 from DT on 2/21/17
     if globalParameters["Backend"] == "HIP":
@@ -671,14 +706,14 @@ def assignDerivedParameters(state):
         if state["MacroTile0"] == 128 or state["MacroTile1"] == 128:
           if state["NumLoadsCoalescedA"] != 1 and state["NumLoadsCoalescedB"] != 8:
             state["Valid"] = False
-            return
+            #return
       elif state["ProblemType"]["DataType"].value == DataType.double:
         if globalParameters["Backend"] == "HIP":
           if state["MacroTile0"] >= 64 or state["MacroTile1"] >= 64:
             state["Valid"] = False
-            return
+            #return
+    state["AssignedDerivedParameters"] = True
 
-    state["Valid"] = True
 
 # validation failures
 # Cijk_Ailk_Bjlk_SB_DU16_LU16_MT064_MT164_NLA16_NLB16_NLCA02_NLCB01_NLPA08_NLPB16_TT008_TT108_TTE08_WG008_WG108_WGE08
@@ -694,7 +729,6 @@ def assignDerivedParameters(state):
 
 
 
-
   ########################################
   # create a dictionary with booleans on whether to include parameter in name
   @staticmethod
@@ -755,8 +789,6 @@ def getNameMin(state, requiredParameters):
             first = False
           name += "%s%s" % ( Solution.getParameterNameAbbreviation(key), \
               Solution.getParameterValueAbbreviation(state[key]) )
-      #else:
-      #  print "%s not in %s" % (key, requiredParameters)
     return name
 
   ########################################
@@ -765,26 +797,20 @@ def getNameMin(state, requiredParameters):
   def getSerialNaming(objs):
     data = {}
     for objIdx in range(0, len(objs)):
-      #print "ObjIdx: %u" % objIdx
       obj = objs[objIdx]
       for paramName in sorted(obj.keys()):
         if paramName not in derrivedParameters:
           paramValue = obj[paramName]
-          #if paramName == "ThreadTileEdge":
-          #  print "%s = %s" % (paramName, paramValue)
           if paramName in data:
             if paramValue not in data[paramName]:
               data[paramName].append(paramValue)
           else:
             data[paramName] = [ paramValue ]
     maxObjs = 1
-    #print "SerialNaming:"
     for paramName in data:
       data[paramName] = sorted(data[paramName])
-      #print "%s: %s" % (paramName, data[paramName])
       maxObjs *= len(data[paramName])
     numDigits = len(str(maxObjs))
-    #print "MaxSerialNames: %u (%u)" % (maxObjs, numDigits)
     return [ data, numDigits ]
 
   ########################################
@@ -803,23 +829,10 @@ def getNameSerial(state, serialNaming):
         paramNameMultiplier = len(paramData)
         if paramValue in paramData:
           paramValueIdx = paramData.index(paramValue)
-        #else:
-          #print "ERROR %s: %s not in %s" % ( paramName, paramValue, paramData )
-          #print state
-          #printExit()
-        #if paramNameMultiplier > 1:
-          #print "serial = %u*%u + %u; multiplier = %u * %u; %s::%s in %s" % ( \
-          #    paramValueIdx, multiplier, serial, \
-          #    paramNameMultiplier, multiplier, \
-          #    paramName, paramValue, paramData[1] )
-
         serial += paramValueIdx * multiplier
         multiplier *= paramNameMultiplier
-    #if serial == 0:
-    #  print state
     name = "%s%0*u" % ("S" if isinstance(state, Solution) else "K", \
         numDigits, serial)
-    #print "SerialName: %s" % name
     return name
 
 
diff --git a/Tensile/__init__.py b/Tensile/__init__.py
index 58039f505..4eabd0b3f 100644
--- a/Tensile/__init__.py
+++ b/Tensile/__init__.py
@@ -1 +1 @@
-__version__ = "2.1.1"
+__version__ = "2.1.2"

From 443895ae7a02b0aa751cabda9e9f934707543518 Mon Sep 17 00:00:00 2001
From: David Tanner <guacamoleo@gmail.com>
Date: Fri, 24 Feb 2017 12:48:15 -0600
Subject: [PATCH 04/21] fixed removing derived parameters from naming

---
 Tensile/Common.py          |  5 +++--
 Tensile/SolutionStructs.py | 38 +++++++++++++++++++++-----------------
 Tensile/__init__.py        |  2 +-
 3 files changed, 25 insertions(+), 20 deletions(-)

diff --git a/Tensile/Common.py b/Tensile/Common.py
index 8576813b2..254985c7a 100644
--- a/Tensile/Common.py
+++ b/Tensile/Common.py
@@ -96,8 +96,8 @@
     {"VectorWidthLocalStore":   [ 4 ] },
     ]
 
-# derrived parameters may show up in solution dict but don't use for naming
-derrivedParameters = [
+# derived parameters may show up in solution dict but don't use for naming
+derivedParameters = [
     "MacroTile0",
     "MacroTile1",
     "WorkGroup0",
@@ -108,6 +108,7 @@
     "NumLoadsB",
     "NumLoadsPerpendicularA",
     "NumLoadsPerpendicularB",
+    "NumThreads",
     ]
 
 # dictionary of defaults comprised for 1st option for each parameter
diff --git a/Tensile/SolutionStructs.py b/Tensile/SolutionStructs.py
index 5f4eff66e..2f6b9d407 100644
--- a/Tensile/SolutionStructs.py
+++ b/Tensile/SolutionStructs.py
@@ -20,7 +20,7 @@
 ################################################################################
 
 
-from Common import globalParameters, defaultProblemType, assignParameterWithDefault, printExit, assignParameterRequired, defaultSolution, derrivedParameters
+from Common import globalParameters, defaultProblemType, assignParameterWithDefault, printExit, assignParameterRequired, defaultSolution, derivedParameters
 from copy import deepcopy
 
 ################################################################################
@@ -583,6 +583,7 @@ def assignProblemIndependentDerivedParameters(state):
     if "SplitU" in state and "LoopUnroll" in state:
       state["DepthU"] = state["SplitU"] * state["LoopUnroll"]
 
+    printReason = False
     # num threads
     state["NumThreads"] = state["WorkGroup0"]*state["WorkGroup1"]
     if state["NumThreads"] > globalParameters["MaxThreads"]:
@@ -625,14 +626,15 @@ def assignDerivedParameters(state):
       if printReason: print2("totalElementsA %u %% NumThreads %u != 0" \
           % (totalElementsA, state["NumThreads"]))
       state["Valid"] = False
-      #return
+      return
     else:
       state["NumLoadsA"] = totalElementsA / state["NumThreads"]
     if totalElementsB % state["NumThreads"] != 0:
       if printReason: print2("totalElementsB %u %% NumThreads %u != 0" \
           % (totalElementsB, state["NumThreads"]))
       state["Valid"] = False
-      #return
+      return
+      state["NumLoadsB"] = totalElementsB / state["NumThreads"]
     else:
       state["NumLoadsB"] = totalElementsB / state["NumThreads"]
 
@@ -643,7 +645,7 @@ def assignDerivedParameters(state):
       if printReason: print2("numLoadsA %u %% numLoadsParaA %u != 0" \
           % (state["NumLoadsA"], state["NumLoadsCoalescedA"]))
       state["Valid"] = False
-      #return
+      return
     else:
       state["NumLoadsPerpendicularA"] = state["NumLoadsA"] \
           / state["NumLoadsCoalescedA"]
@@ -653,7 +655,7 @@ def assignDerivedParameters(state):
       if printReason: print2("numLoadsB %u %% numLoadsParaB %u != 0" \
           % (state["NumLoadsB"], state["NumLoadsCoalescedB"]))
       state["Valid"] = False
-      #return
+      return
     else:
       state["NumLoadsPerpendicularB"] = state["NumLoadsB"] \
           / state["NumLoadsCoalescedB"]
@@ -663,14 +665,14 @@ def assignDerivedParameters(state):
       if printReason: print2("totalElementsParaA %u %% numLoadsParaA %u != 0" \
           % (totalElementsParaA, state["NumLoadsCoalescedA"]))
       state["Valid"] = False
-      #return
+      return
     #else:
     #  loadSizeParaA = totalElementsParaA / state["NumLoadsCoalescedA"]
     if totalElementsPerpA % state["NumLoadsPerpendicularA"] != 0:
       if printReason: print2("totalElementsPerpA %u %% numLoadsPerpA %u != 0" \
           % (totalElementsPerpA, state["NumLoadsPerpendicularA"]))
       state["Valid"] = False
-      #return
+      return
     #else:
     #  loadSizePerpA = totalElementsPerpA / state["NumLoadsPerpendicularA"]
 
@@ -679,14 +681,14 @@ def assignDerivedParameters(state):
       if printReason: print2("totalElementsParaB %u %% numLoadsParaB %u != 0" \
           % (totalElementsParaB, state["NumLoadsCoalescedB"]))
       state["Valid"] = False
-      #return
+      return
     #else:
     #  loadSizeParaB = totalElementsParaB / state["NumLoadsCoalescedB"]
     if totalElementsPerpB % state["NumLoadsPerpendicularB"] != 0:
       if printReason: print2("totalElementsPerpB %u %% numLoadsPerpB %u != 0" \
           % (totalElementsPerpB, state["NumLoadsPerpendicularB"]))
       state["Valid"] = False
-      #return
+      return
     #else:
     #  loadSizePerpB = totalElementsPerpB / state["NumLoadsPerpendicularB"]
 
@@ -698,7 +700,7 @@ def assignDerivedParameters(state):
     if sizeLDS > globalParameters["MaxLDS"]:
       if printReason: print2("Kernel Uses %u > %u bytes" % ( sizeLDS, globalParameters["MaxLDS"]))
       state["Valid"] = False
-      #return
+      return
 
     # Compiler may be causing incorrect spills on ROCm1.4 from DT on 2/21/17
     if globalParameters["Backend"] == "HIP":
@@ -745,14 +747,16 @@ def getMinNaming(objs):
     # only 1, rather than name being nothing, it'll be everything
     if len(objs) == 1:
       for key in keys:
-        requiredParameters[key] = False
+        if key not in derivedParameters:
+          requiredParameters[key] = False
     else:
       for key in keys:
         required = False
-        for i in range(1, len(objs)):
-          if objs[0][key] != objs[i][key]:
-            required = True
-            break
+        if key not in derivedParameters:
+          for i in range(1, len(objs)):
+            if objs[0][key] != objs[i][key]:
+              required = True
+              break
         if required:
           requiredParameters[key] = True
         else:
@@ -799,7 +803,7 @@ def getSerialNaming(objs):
     for objIdx in range(0, len(objs)):
       obj = objs[objIdx]
       for paramName in sorted(obj.keys()):
-        if paramName not in derrivedParameters:
+        if paramName not in derivedParameters:
           paramValue = obj[paramName]
           if paramName in data:
             if paramValue not in data[paramName]:
@@ -823,7 +827,7 @@ def getNameSerial(state, serialNaming):
     serial = 0
     multiplier = 1
     for paramName in sorted(state.keys()):
-      if paramName not in derrivedParameters:
+      if paramName not in derivedParameters:
         paramValue = state[paramName]
         paramData = data[paramName]
         paramNameMultiplier = len(paramData)
diff --git a/Tensile/__init__.py b/Tensile/__init__.py
index 4eabd0b3f..e835b9d02 100644
--- a/Tensile/__init__.py
+++ b/Tensile/__init__.py
@@ -1 +1 @@
-__version__ = "2.1.2"
+__version__ = "2.1.3"

From 2e6e67d980c1e4c96fa3ef2642bbb44fd1dc9a0b Mon Sep 17 00:00:00 2001
From: David Tanner <guacamoleo@gmail.com>
Date: Fri, 24 Feb 2017 14:14:28 -0600
Subject: [PATCH 05/21] NumLoadsCoalesced 1/-1 (nlca) now search for the next
 valid value; set as default

---
 Tensile/BenchmarkProblems.py            |   9 +-
 Tensile/Common.py                       |   8 +-
 Tensile/Configs/rocblas_cgemm.yaml      |  16 +--
 Tensile/Configs/rocblas_dgemm.yaml      |  16 +--
 Tensile/Configs/rocblas_sgemm.yaml      |  16 +--
 Tensile/Configs/rocblas_zgemm.yaml      |  16 +--
 Tensile/Configs/sgemm.yaml              |   5 +-
 Tensile/Configs/sgemm_5760.yaml         |   8 --
 Tensile/Configs/tensor_contraction.yaml |   4 +-
 Tensile/SolutionStructs.py              | 165 +++++++++++++++++++-----
 Tensile/__init__.py                     |   2 +-
 11 files changed, 177 insertions(+), 88 deletions(-)

diff --git a/Tensile/BenchmarkProblems.py b/Tensile/BenchmarkProblems.py
index cd0bbc575..178acf213 100644
--- a/Tensile/BenchmarkProblems.py
+++ b/Tensile/BenchmarkProblems.py
@@ -175,10 +175,13 @@ def benchmarkProblemType( config ):
             for hardcodedSolution in hardcodedSolutions:
               if hardcodedSolution == solutionObject:
                 hasSolution = True
-          if not hasSolution:
+          if hasSolution:
+            if globalParameters["PrintLevel"] >= 1:
+              sys.stdout.write(":")
+          else:
             solutions[hardcodedIdx].append(solutionObject)
-          if globalParameters["PrintLevel"] >= 1:
-            sys.stdout.write("|")
+            if globalParameters["PrintLevel"] >= 1:
+              sys.stdout.write("|")
         else:
           if globalParameters["PrintLevel"] >= 1:
             sys.stdout.write(".")
diff --git a/Tensile/Common.py b/Tensile/Common.py
index 254985c7a..2cf0e0906 100644
--- a/Tensile/Common.py
+++ b/Tensile/Common.py
@@ -67,6 +67,8 @@
     {"EdgeType":                [ "Branch" ] }, # Shift
     {"EdgeMultiKernel":         [ False ] },
     {"PadLDS":                  [ 1 ] },
+    {"SplitU":                  [ 1 ] },
+    {"Prefetch":                [ False ] },
     ]
 # benchmark these solution independently
 defaultForkParameters = [
@@ -74,8 +76,8 @@
     {"WorkGroupShape":          [ 0 ] }, # -1, 0, 1
     {"ThreadTileEdge":          [ 1, 2, 4, 6, 8 ] },
     {"ThreadTileShape":         [ 0 ] }, # -1, 0, 1
-    {"SplitU":                  [ 1 ] },
-    {"Prefetch":                [ False ] },
+    {"NumLoadsCoalescedA":       [ 1, -1 ] },
+    {"NumLoadsCoalescedB":       [ 1, -1 ] },
     ]
 # keep one winner per solution and it affects which will win
 defaultBenchmarkForkParameters = [
@@ -88,8 +90,6 @@
     ]
 # keep one winner per solution and it would affect which solutions fastest
 defaultBenchmarkJoinParameters = [
-    {"NumLoadsCoalescedA":       [ 1, 2, 3, 4, 6, 8 ] },
-    {"NumLoadsCoalescedB":       [ 1, 2, 3, 4, 6, 8 ] },
     {"VectorWidthGlobalLoad":   [ 4 ] },
     {"VectorWidthGlobalStore":  [ 4 ] },
     {"VectorWidthLocalLoad":    [ 4 ] },
diff --git a/Tensile/Configs/rocblas_cgemm.yaml b/Tensile/Configs/rocblas_cgemm.yaml
index b1c5e2257..3c0fbd958 100644
--- a/Tensile/Configs/rocblas_cgemm.yaml
+++ b/Tensile/Configs/rocblas_cgemm.yaml
@@ -41,8 +41,8 @@ BenchmarkProblems:
       - ThreadTileShape: [ 0 ]
       - WorkGroupEdge: [8, 16]
       - ThreadTileEdge: [2, 4, 6, 8]
-      - NumLoadsCoalescedA: [ 1, 2, 3, 4, 6, 8 ]
-      - NumLoadsCoalescedB: [ 1, 2, 3, 4, 6, 8 ]
+      - NumLoadsCoalescedA: [ 1, -1 ]
+      - NumLoadsCoalescedB: [ 1, -1 ]
       - LoopUnroll: [8, 16]
     BenchmarkForkParameters:
     JoinParameters:
@@ -74,8 +74,8 @@ BenchmarkProblems:
       - WorkGroupShape: [ 0 ]
       - ThreadTileEdge: [2, 4, 6, 8]
       - ThreadTileShape: [ 0 ]
-      - NumLoadsCoalescedA: [ 1, 2, 3, 4, 6, 8 ]
-      - NumLoadsCoalescedB: [ 1, 2, 3, 4, 6, 8 ]
+      - NumLoadsCoalescedA: [ 1, -1 ]
+      - NumLoadsCoalescedB: [ 1, -1 ]
       - LoopUnroll: [8, 16]
     BenchmarkForkParameters:
     JoinParameters:
@@ -107,8 +107,8 @@ BenchmarkProblems:
       - WorkGroupShape: [ 0 ]
       - ThreadTileEdge: [2, 4, 6, 8]
       - ThreadTileShape: [ 0 ]
-      - NumLoadsCoalescedA: [ 1, 2, 3, 4, 6, 8 ]
-      - NumLoadsCoalescedB: [ 1, 2, 3, 4, 6, 8 ]
+      - NumLoadsCoalescedA: [ 1, -1 ]
+      - NumLoadsCoalescedB: [ 1, -1 ]
       - LoopUnroll: [8, 16]
     BenchmarkForkParameters:
     JoinParameters:
@@ -140,8 +140,8 @@ BenchmarkProblems:
       - WorkGroupShape: [ 0 ]
       - ThreadTileEdge: [2, 4, 6, 8]
       - ThreadTileShape: [ 0 ]
-      - NumLoadsCoalescedA: [ 1, 2, 3, 4, 6, 8 ]
-      - NumLoadsCoalescedB: [ 1, 2, 3, 4, 6, 8 ]
+      - NumLoadsCoalescedA: [ 1, -1 ]
+      - NumLoadsCoalescedB: [ 1, -1 ]
       - LoopUnroll: [8, 16]
     BenchmarkForkParameters:
     JoinParameters:
diff --git a/Tensile/Configs/rocblas_dgemm.yaml b/Tensile/Configs/rocblas_dgemm.yaml
index 36e358135..f4bc58257 100644
--- a/Tensile/Configs/rocblas_dgemm.yaml
+++ b/Tensile/Configs/rocblas_dgemm.yaml
@@ -41,8 +41,8 @@ BenchmarkProblems:
       - ThreadTileShape: [ 0 ]
       - WorkGroupEdge: [8, 16]
       - ThreadTileEdge: [2, 4, 6, 8]
-      - NumLoadsCoalescedA: [ 1, 2, 3, 4, 6, 8 ]
-      - NumLoadsCoalescedB: [ 1, 2, 3, 4, 6, 8 ]
+      - NumLoadsCoalescedA: [ 1, -1 ]
+      - NumLoadsCoalescedB: [ 1, -1 ]
       - LoopUnroll: [8, 16]
     BenchmarkForkParameters:
     JoinParameters:
@@ -74,8 +74,8 @@ BenchmarkProblems:
       - WorkGroupShape: [ 0 ]
       - ThreadTileEdge: [2, 4, 6, 8]
       - ThreadTileShape: [ 0 ]
-      - NumLoadsCoalescedA: [ 1, 2, 3, 4, 6, 8 ]
-      - NumLoadsCoalescedB: [ 1, 2, 3, 4, 6, 8 ]
+      - NumLoadsCoalescedA: [ 1, -1 ]
+      - NumLoadsCoalescedB: [ 1, -1 ]
       - LoopUnroll: [8, 16]
     BenchmarkForkParameters:
     JoinParameters:
@@ -107,8 +107,8 @@ BenchmarkProblems:
       - WorkGroupShape: [ 0 ]
       - ThreadTileEdge: [2, 4, 6, 8]
       - ThreadTileShape: [ 0, 1 ]
-      - NumLoadsCoalescedA: [ 1, 2, 3, 4, 6, 8 ]
-      - NumLoadsCoalescedB: [ 1, 2, 3, 4, 6, 8 ]
+      - NumLoadsCoalescedA: [ 1, -1 ]
+      - NumLoadsCoalescedB: [ 1, -1 ]
       - LoopUnroll: [8, 16]
     BenchmarkForkParameters:
     JoinParameters:
@@ -140,8 +140,8 @@ BenchmarkProblems:
       - WorkGroupShape: [ 0 ]
       - ThreadTileEdge: [2, 4, 6, 8]
       - ThreadTileShape: [ 0 ]
-      - NumLoadsCoalescedA: [ 1, 2, 3, 4, 6, 8 ]
-      - NumLoadsCoalescedB: [ 1, 2, 3, 4, 6, 8 ]
+      - NumLoadsCoalescedA: [ 1, -1 ]
+      - NumLoadsCoalescedB: [ 1, -1 ]
       - LoopUnroll: [8, 16]
     BenchmarkForkParameters:
     JoinParameters:
diff --git a/Tensile/Configs/rocblas_sgemm.yaml b/Tensile/Configs/rocblas_sgemm.yaml
index 346bf1cb7..fedec0deb 100644
--- a/Tensile/Configs/rocblas_sgemm.yaml
+++ b/Tensile/Configs/rocblas_sgemm.yaml
@@ -41,8 +41,8 @@ BenchmarkProblems:
       - ThreadTileShape: [ 0 ]
       - WorkGroupEdge: [8, 16]
       - ThreadTileEdge: [2, 4, 6, 8]
-      - NumLoadsCoalescedA: [ 1, 2, 3, 4, 6, 8 ]
-      - NumLoadsCoalescedB: [ 1, 2, 3, 4, 6, 8 ]
+      - NumLoadsCoalescedA: [ 1, -1 ]
+      - NumLoadsCoalescedB: [ 1, -1 ]
       - LoopUnroll: [8, 16]
     BenchmarkForkParameters:
     JoinParameters:
@@ -74,8 +74,8 @@ BenchmarkProblems:
       - WorkGroupShape: [ 0 ]
       - ThreadTileEdge: [2, 4, 6, 8]
       - ThreadTileShape: [ 0 ]
-      - NumLoadsCoalescedA: [ 1, 2, 3, 4, 6, 8 ]
-      - NumLoadsCoalescedB: [ 1, 2, 3, 4, 6, 8 ]
+      - NumLoadsCoalescedA: [ 1, -1 ]
+      - NumLoadsCoalescedB: [ 1, -1 ]
       - LoopUnroll: [8, 16]
     BenchmarkForkParameters:
     JoinParameters:
@@ -107,8 +107,8 @@ BenchmarkProblems:
       - WorkGroupShape: [ 0 ]
       - ThreadTileEdge: [2, 4, 6, 8]
       - ThreadTileShape: [ 0, 1 ]
-      - NumLoadsCoalescedA: [ 1, 2, 3, 4, 6, 8 ]
-      - NumLoadsCoalescedB: [ 1, 2, 3, 4, 6, 8 ]
+      - NumLoadsCoalescedA: [ 1, -1 ]
+      - NumLoadsCoalescedB: [ 1, -1 ]
       - LoopUnroll: [8, 16]
     BenchmarkForkParameters:
     JoinParameters:
@@ -140,8 +140,8 @@ BenchmarkProblems:
       - WorkGroupShape: [ 0 ]
       - ThreadTileEdge: [2, 4, 6, 8]
       - ThreadTileShape: [ 0 ]
-      - NumLoadsCoalescedA: [ 1, 2, 3, 4, 6, 8 ]
-      - NumLoadsCoalescedB: [ 1, 2, 3, 4, 6, 8 ]
+      - NumLoadsCoalescedA: [ 1, -1 ]
+      - NumLoadsCoalescedB: [ 1, -1 ]
       - LoopUnroll: [8, 16]
     BenchmarkForkParameters:
     JoinParameters:
diff --git a/Tensile/Configs/rocblas_zgemm.yaml b/Tensile/Configs/rocblas_zgemm.yaml
index badbd3dfb..64f0f6af9 100644
--- a/Tensile/Configs/rocblas_zgemm.yaml
+++ b/Tensile/Configs/rocblas_zgemm.yaml
@@ -41,8 +41,8 @@ BenchmarkProblems:
       - ThreadTileShape: [ 0 ]
       - WorkGroupEdge: [8, 16]
       - ThreadTileEdge: [2, 4, 6, 8]
-      - NumLoadsCoalescedA: [ 1, 2, 3, 4, 6, 8 ]
-      - NumLoadsCoalescedB: [ 1, 2, 3, 4, 6, 8 ]
+      - NumLoadsCoalescedA: [ 1, -1 ]
+      - NumLoadsCoalescedB: [ 1, -1 ]
       - LoopUnroll: [8, 16]
     BenchmarkForkParameters:
     JoinParameters:
@@ -74,8 +74,8 @@ BenchmarkProblems:
       - WorkGroupShape: [ 0 ]
       - ThreadTileEdge: [2, 4, 6, 8]
       - ThreadTileShape: [ 0 ]
-      - NumLoadsCoalescedA: [ 1, 2, 3, 4, 6, 8 ]
-      - NumLoadsCoalescedB: [ 1, 2, 3, 4, 6, 8 ]
+      - NumLoadsCoalescedA: [ 1, -1 ]
+      - NumLoadsCoalescedB: [ 1, -1 ]
       - LoopUnroll: [8, 16]
     BenchmarkForkParameters:
     JoinParameters:
@@ -107,8 +107,8 @@ BenchmarkProblems:
       - WorkGroupShape: [ 0 ]
       - ThreadTileEdge: [2, 4, 6, 8]
       - ThreadTileShape: [ 0 ]
-      - NumLoadsCoalescedA: [ 1, 2, 3, 4, 6, 8 ]
-      - NumLoadsCoalescedB: [ 1, 2, 3, 4, 6, 8 ]
+      - NumLoadsCoalescedA: [ 1, -1 ]
+      - NumLoadsCoalescedB: [ 1, -1 ]
       - LoopUnroll: [8, 16]
     BenchmarkForkParameters:
     JoinParameters:
@@ -140,8 +140,8 @@ BenchmarkProblems:
       - WorkGroupShape: [ 0 ]
       - ThreadTileEdge: [2, 4, 6, 8]
       - ThreadTileShape: [ 0 ]
-      - NumLoadsCoalescedA: [ 1, 2, 3, 4, 6, 8 ]
-      - NumLoadsCoalescedB: [ 1, 2, 3, 4, 6, 8 ]
+      - NumLoadsCoalescedA: [ 1, -1 ]
+      - NumLoadsCoalescedB: [ 1, -1 ]
       - LoopUnroll: [8, 16]
     BenchmarkForkParameters:
     JoinParameters:
diff --git a/Tensile/Configs/sgemm.yaml b/Tensile/Configs/sgemm.yaml
index 49e6d13c7..7d2805e4a 100644
--- a/Tensile/Configs/sgemm.yaml
+++ b/Tensile/Configs/sgemm.yaml
@@ -40,13 +40,10 @@ BenchmarkProblems:
     ForkParameters:
       - WorkGroupEdge: [8, 16]
       - ThreadTileEdge: [2, 4, 8 ]
-    BenchmarkForkParameters:
-      - NumLoadsCoalescedA: [ 1, 2, 3, 4, 6 ]
-      - NumLoadsCoalescedB: [ 1, 2, 3, 4, 6 ]
       - LoopUnroll: [8, 16]
+    BenchmarkForkParameters:
     JoinParameters:
       - MacroTile
-      - DepthU
     BenchmarkJoinParameters:
     BenchmarkFinalParameters:
       - ProblemSizes: [ [16, 128], [16, 128], [256] ]
diff --git a/Tensile/Configs/sgemm_5760.yaml b/Tensile/Configs/sgemm_5760.yaml
index d8f077b85..7c167a650 100644
--- a/Tensile/Configs/sgemm_5760.yaml
+++ b/Tensile/Configs/sgemm_5760.yaml
@@ -39,8 +39,6 @@ BenchmarkProblems:
       - ThreadTileShape: [0, 1]
       - WorkGroupEdge: [8, 16]
       - ThreadTileEdge: [ 8 ]
-      - NumLoadsCoalescedA: [ 1, 2, 3, 4, 6, 8 ]
-      - NumLoadsCoalescedB: [ 1, 2, 3, 4, 6, 8 ]
       - LoopUnroll: [8, 16]
     #BenchmarkForkParameters:
     JoinParameters:
@@ -71,8 +69,6 @@ BenchmarkProblems:
       - ThreadTileShape: [0, 1]
       - WorkGroupEdge: [8, 16]
       - ThreadTileEdge: [4, 6, 8]
-      - NumLoadsCoalescedA: [ 1, 2, 3, 4, 5, 6 ]
-      - NumLoadsCoalescedB: [ 1, 2, 3, 4, 5, 6 ]
       - LoopUnroll: [8, 16]
     #BenchmarkForkParameters:
     JoinParameters:
@@ -103,8 +99,6 @@ BenchmarkProblems:
       - ThreadTileShape: [0, 1]
       - WorkGroupEdge: [8, 16]
       - ThreadTileEdge: [4, 6, 8]
-      - NumLoadsCoalescedA: [ 1, 2, 3, 4, 5, 6 ]
-      - NumLoadsCoalescedB: [ 1, 2, 3, 4, 5, 6 ]
       - LoopUnroll: [8, 16]
     #BenchmarkForkParameters:
     JoinParameters:
@@ -135,8 +129,6 @@ BenchmarkProblems:
       - ThreadTileShape: [0, 1]
       - WorkGroupEdge: [8, 16]
       - ThreadTileEdge: [4, 6, 8]
-      - NumLoadsCoalescedA: [ 1, 2, 3, 4, 5, 6 ]
-      - NumLoadsCoalescedB: [ 1, 2, 3, 4, 5, 6 ]
       - LoopUnroll: [8, 16]
     #BenchmarkForkParameters:
     JoinParameters:
diff --git a/Tensile/Configs/tensor_contraction.yaml b/Tensile/Configs/tensor_contraction.yaml
index dd44124d7..8201d0bec 100644
--- a/Tensile/Configs/tensor_contraction.yaml
+++ b/Tensile/Configs/tensor_contraction.yaml
@@ -40,10 +40,8 @@ BenchmarkProblems:
     ForkParameters:
       - WorkGroupEdge: [8, 16]
       - ThreadTileEdge: [2, 4, 8 ]
-    BenchmarkForkParameters:
-      - NumLoadsCoalescedA: [ 1, 2, 3, 4, 6 ]
-      - NumLoadsCoalescedB: [ 1, 2, 3, 4, 6 ]
       - LoopUnroll: [8, 16]
+    BenchmarkForkParameters:
     JoinParameters:
       - MacroTile
       - DepthU
diff --git a/Tensile/SolutionStructs.py b/Tensile/SolutionStructs.py
index 2f6b9d407..c83d28fed 100644
--- a/Tensile/SolutionStructs.py
+++ b/Tensile/SolutionStructs.py
@@ -606,20 +606,20 @@ def assignDerivedParameters(state):
 
     # how many elements to load
     if state["ProblemType"]["TLUA"]:
-      totalElementsParaA = state["MacroTile0"]
+      totalElementsCoalescedA = state["MacroTile0"]
       totalElementsPerpA = state["LoopUnroll"]
     else:
-      totalElementsParaA = state["LoopUnroll"]
+      totalElementsCoalescedA = state["LoopUnroll"]
       totalElementsPerpA = state["MacroTile0"]
 
     if state["ProblemType"]["TLUB"]:
-      totalElementsParaB = state["MacroTile1"]
+      totalElementsCoalescedB = state["MacroTile1"]
       totalElementsPerpB = state["LoopUnroll"]
     else:
-      totalElementsParaB = state["LoopUnroll"]
+      totalElementsCoalescedB = state["LoopUnroll"]
       totalElementsPerpB = state["MacroTile1"]
-    totalElementsA = totalElementsParaA * totalElementsPerpA
-    totalElementsB = totalElementsParaB * totalElementsPerpB
+    totalElementsA = totalElementsCoalescedA * totalElementsPerpA
+    totalElementsB = totalElementsCoalescedB * totalElementsPerpB
 
     # how many load instructions
     if totalElementsA % state["NumThreads"] != 0:
@@ -638,17 +638,122 @@ def assignDerivedParameters(state):
     else:
       state["NumLoadsB"] = totalElementsB / state["NumThreads"]
 
-    # how many loads para
-    if state["NumLoadsCoalescedA"] < 1:
-      state["NumLoadsCoalescedA"] = state["NumLoadsA"]
-    if state["NumLoadsA"] % state["NumLoadsCoalescedA"] != 0:
-      if printReason: print2("numLoadsA %u %% numLoadsParaA %u != 0" \
-          % (state["NumLoadsA"], state["NumLoadsCoalescedA"]))
-      state["Valid"] = False
-      return
+    # nlca = 1
+    if state["NumLoadsCoalescedA"] == 1:
+      foundValid = False
+      for nlca in range(1, state["NumLoadsA"]+1):
+        nlpa = state["NumLoadsA"] / nlca
+        if state["NumLoadsA"] % nlca == 0 \
+            and totalElementsCoalescedA % nlca == 0 \
+            and totalElementsPerpA % nlpa == 0:
+          state["NumLoadsCoalescedA"] = nlca
+          state["NumLoadsPerpendicularA"] = nlpa
+          foundValid = True
+          break
+      if not foundValid:
+        state["Valid"] = False
+        return
+
+    # nlca = -1
+    elif state["NumLoadsCoalescedA"] == -1:
+      foundValid = False
+      for nlca in range(state["NumLoadsA"], 0, -1):
+        nlpa = state["NumLoadsA"] / nlca
+        if state["NumLoadsA"] % nlca == 0 \
+            and totalElementsCoalescedA % nlca == 0 \
+            and totalElementsPerpA % nlpa == 0:
+          state["NumLoadsCoalescedA"] = nlca
+          state["NumLoadsPerpendicularA"] = nlpa
+          foundValid = True
+          break
+      if not foundValid:
+        state["Valid"] = False
+        return
+
+    # nlca = other
     else:
       state["NumLoadsPerpendicularA"] = state["NumLoadsA"] \
           / state["NumLoadsCoalescedA"]
+
+      if state["NumLoadsA"] % state["NumLoadsCoalescedA"] != 0:
+        if printReason: print2("numLoadsA %u %% numLoadsParaA %u != 0" \
+            % (state["NumLoadsA"], state["NumLoadsCoalescedA"]))
+        state["Valid"] = False
+      if totalElementsCoalescedA % state["NumLoadsCoalescedA"] != 0:
+        if printReason: print2("totalElementsCoalescedA %u %% numLoadsParaA %u != 0" \
+            % (totalElementsCoalescedA, state["NumLoadsCoalescedA"]))
+        state["Valid"] = False
+        return
+      if totalElementsPerpA % state["NumLoadsPerpendicularA"] != 0:
+        if printReason: print2("totalElementsPerpA %u %% numLoadsPerpA %u != 0" \
+            % (totalElementsPerpA, state["NumLoadsPerpendicularA"]))
+        state["Valid"] = False
+        return
+
+
+
+
+
+    # nlcb = 1
+    if state["NumLoadsCoalescedB"] == 1:
+      foundValid = False
+      for nlca in range(1, state["NumLoadsB"]+1):
+        nlpa = state["NumLoadsB"] / nlca
+        if state["NumLoadsB"] % nlca == 0 \
+            and totalElementsCoalescedB % nlca == 0 \
+            and totalElementsPerpB % nlpa == 0:
+          state["NumLoadsCoalescedB"] = nlca
+          state["NumLoadsPerpendicularB"] = nlpa
+          foundValid = True
+          break
+      if not foundValid:
+        state["Valid"] = False
+        return
+
+    # nlcb = -1
+    elif state["NumLoadsCoalescedB"] == -1:
+      foundValid = False
+      for nlca in range(state["NumLoadsB"], 0, -1):
+        nlpa = state["NumLoadsB"] / nlca
+        if state["NumLoadsB"] % nlca == 0 \
+            and totalElementsCoalescedB % nlca == 0 \
+            and totalElementsPerpB % nlpa == 0:
+          state["NumLoadsCoalescedB"] = nlca
+          state["NumLoadsPerpendicularB"] = nlpa
+          foundValid = True
+          break
+      if not foundValid:
+        state["Valid"] = False
+        return
+
+    # nlcb = other
+    else:
+      state["NumLoadsPerpendicularB"] = state["NumLoadsB"] \
+          / state["NumLoadsCoalescedB"]
+
+      if state["NumLoadsB"] % state["NumLoadsCoalescedB"] != 0:
+        if printReason: print2("numLoadsB %u %% numLoadsParaB %u != 0" \
+            % (state["NumLoadsB"], state["NumLoadsCoalescedB"]))
+        state["Valid"] = False
+      if totalElementsCoalescedB % state["NumLoadsCoalescedB"] != 0:
+        if printReason: print2("totalElementsCoalescedB %u %% numLoadsParaB %u != 0" \
+            % (totalElementsCoalescedB, state["NumLoadsCoalescedB"]))
+        state["Valid"] = False
+        return
+      if totalElementsPerpB % state["NumLoadsPerpendicularB"] != 0:
+        if printReason: print2("totalElementsPerpB %u %% numLoadsPerpB %u != 0" \
+            % (totalElementsPerpB, state["NumLoadsPerpendicularB"]))
+        state["Valid"] = False
+        return
+
+
+
+
+
+
+
+
+    """
     if state["NumLoadsCoalescedB"] < 1:
       state["NumLoadsCoalescedB"] = state["NumLoadsB"]
     if state["NumLoadsB"] % state["NumLoadsCoalescedB"] != 0:
@@ -660,30 +765,15 @@ def assignDerivedParameters(state):
       state["NumLoadsPerpendicularB"] = state["NumLoadsB"] \
           / state["NumLoadsCoalescedB"]
 
-    # load size para/perp A
-    if totalElementsParaA % state["NumLoadsCoalescedA"] != 0:
-      if printReason: print2("totalElementsParaA %u %% numLoadsParaA %u != 0" \
-          % (totalElementsParaA, state["NumLoadsCoalescedA"]))
-      state["Valid"] = False
-      return
-    #else:
-    #  loadSizeParaA = totalElementsParaA / state["NumLoadsCoalescedA"]
-    if totalElementsPerpA % state["NumLoadsPerpendicularA"] != 0:
-      if printReason: print2("totalElementsPerpA %u %% numLoadsPerpA %u != 0" \
-          % (totalElementsPerpA, state["NumLoadsPerpendicularA"]))
-      state["Valid"] = False
-      return
-    #else:
-    #  loadSizePerpA = totalElementsPerpA / state["NumLoadsPerpendicularA"]
 
     # load size para/perp B
-    if totalElementsParaB % state["NumLoadsCoalescedB"] != 0:
-      if printReason: print2("totalElementsParaB %u %% numLoadsParaB %u != 0" \
-          % (totalElementsParaB, state["NumLoadsCoalescedB"]))
+    if totalElementsCoalescedB % state["NumLoadsCoalescedB"] != 0:
+      if printReason: print2("totalElementsCoalescedB %u %% numLoadsParaB %u != 0" \
+          % (totalElementsCoalescedB, state["NumLoadsCoalescedB"]))
       state["Valid"] = False
       return
     #else:
-    #  loadSizeParaB = totalElementsParaB / state["NumLoadsCoalescedB"]
+    #  loadSizeParaB = totalElementsCoalescedB / state["NumLoadsCoalescedB"]
     if totalElementsPerpB % state["NumLoadsPerpendicularB"] != 0:
       if printReason: print2("totalElementsPerpB %u %% numLoadsPerpB %u != 0" \
           % (totalElementsPerpB, state["NumLoadsPerpendicularB"]))
@@ -691,6 +781,15 @@ def assignDerivedParameters(state):
       return
     #else:
     #  loadSizePerpB = totalElementsPerpB / state["NumLoadsPerpendicularB"]
+    """
+
+
+
+
+
+
+
+
 
     # too much LDS
     sizeLDS = state["LoopUnroll"] \
diff --git a/Tensile/__init__.py b/Tensile/__init__.py
index e835b9d02..df4be5e09 100644
--- a/Tensile/__init__.py
+++ b/Tensile/__init__.py
@@ -1 +1 @@
-__version__ = "2.1.3"
+__version__ = "2.1.4"

From 250bfb0c4122bc0d38faa27e1ef728fc373c0242 Mon Sep 17 00:00:00 2001
From: David Tanner <guacamoleo@gmail.com>
Date: Fri, 24 Feb 2017 14:30:02 -0600
Subject: [PATCH 06/21] faster addition of MANY solutions

---
 Tensile/BenchmarkProblems.py | 15 ++++++---------
 1 file changed, 6 insertions(+), 9 deletions(-)

diff --git a/Tensile/BenchmarkProblems.py b/Tensile/BenchmarkProblems.py
index 178acf213..62d9867b4 100644
--- a/Tensile/BenchmarkProblems.py
+++ b/Tensile/BenchmarkProblems.py
@@ -148,6 +148,7 @@ def benchmarkProblemType( config ):
     # Enumerate Solutions = Hardcoded * Benchmark
     ############################################################################
     sys.stdout.write("# Enumerating Solutions")
+    solutionSet = set() # avoid duplicates for nlca=-1, 1
     for hardcodedIdx in range(0, numHardcoded):
       solutions.append([])
       hardcodedParamDict = benchmarkStep.hardcodedParameters[hardcodedIdx]
@@ -170,18 +171,14 @@ def benchmarkProblemType( config ):
         # TODO check if solution matches problem size for exact tile kernels
         solutionObject = Solution(solution)
         if solutionObject["Valid"]:
-          hasSolution = False
-          for hardcodedSolutions in solutions:
-            for hardcodedSolution in hardcodedSolutions:
-              if hardcodedSolution == solutionObject:
-                hasSolution = True
-          if hasSolution:
-            if globalParameters["PrintLevel"] >= 1:
-              sys.stdout.write(":")
-          else:
+          if solutionObject not in solutionSet:
+            solutionSet.add(solutionObject)
             solutions[hardcodedIdx].append(solutionObject)
             if globalParameters["PrintLevel"] >= 1:
               sys.stdout.write("|")
+          else:
+            if globalParameters["PrintLevel"] >= 1:
+              sys.stdout.write(":")
         else:
           if globalParameters["PrintLevel"] >= 1:
             sys.stdout.write(".")

From 9db107768f1ee532d2e7530c5863c27797944e66 Mon Sep 17 00:00:00 2001
From: David Tanner <guacamoleo@gmail.com>
Date: Fri, 24 Feb 2017 15:15:52 -0600
Subject: [PATCH 07/21] fixed definition of workgroup and threadtile shapes;
 shapes can now be made even skinnier

---
 Tensile/BenchmarkStructs.py |  8 ++++----
 Tensile/SolutionStructs.py  | 16 ++++++++--------
 Tensile/__init__.py         |  2 +-
 3 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/Tensile/BenchmarkStructs.py b/Tensile/BenchmarkStructs.py
index b119001ae..236f67eb9 100644
--- a/Tensile/BenchmarkStructs.py
+++ b/Tensile/BenchmarkStructs.py
@@ -399,13 +399,13 @@ def convertParametersToSteps(self):
           macroTileDim0 = workGroupEdgeValues[workGroupEdgeIdx]*threadTileEdgeValues[threadTileEdgeIdx]
           macroTileDim1 = macroTileDim0
           if workGroupShapeValues[workGroupShapeIdx] < 0:
-            macroTileDim1 /= 2
+            macroTileDim0 *= abs(workGroupShapeValues[workGroupShapeIdx])
           elif workGroupShapeValues[workGroupShapeIdx] > 0:
-            macroTileDim1 *= 2
+            macroTileDim1 *= abs(workGroupShapeValues[workGroupShapeIdx])
           if threadTileShapeValues[threadTileShapeIdx] < 0:
-            macroTileDim1 /= 2
+            macroTileDim0 *= abs(threadTileShapeValues[threadTileShapeIdx])
           elif threadTileShapeValues[threadTileShapeIdx] > 0:
-            macroTileDim1 *= 2
+            macroTileDim1 *= abs(threadTileShapeValues[threadTileShapeIdx])
           if macroTileDim0/macroTileDim1 <= self.initialSolutionParameters["MacroTileMaxRatio"] and macroTileDim1/macroTileDim0 <= self.initialSolutionParameters["MacroTileMaxRatio"]:
             macroTileJoinSet.add((macroTileDim0, macroTileDim1))
         totalPermutations *=len(macroTileJoinSet)
diff --git a/Tensile/SolutionStructs.py b/Tensile/SolutionStructs.py
index c83d28fed..eeeef16af 100644
--- a/Tensile/SolutionStructs.py
+++ b/Tensile/SolutionStructs.py
@@ -562,18 +562,18 @@ def assignProblemIndependentDerivedParameters(state):
     # workgroup sizes
     state["WorkGroup0"] = state["WorkGroupEdge"]
     state["WorkGroup1"] = state["WorkGroupEdge"]
-    if state["WorkGroupShape"] == 1:
-      state["WorkGroup1"] *= 2
-    elif state["WorkGroupShape"] == -1:
-      state["WorkGroup0"] *= 2
+    if state["WorkGroupShape"] > 0:
+      state["WorkGroup1"] *= abs(state["WorkGroupShape"])
+    elif state["WorkGroupShape"] < 0:
+      state["WorkGroup0"] *= abs(state["WorkGroupShape"])
 
     # thread tile sizes
     state["ThreadTile0"] = state["ThreadTileEdge"]
     state["ThreadTile1"] = state["ThreadTileEdge"]
-    if state["ThreadTileShape"] == 1:
-      state["ThreadTile1"] *= 2
-    elif state["ThreadTileShape"] == -1:
-      state["ThreadTile0"] *= 2
+    if state["ThreadTileShape"] > 0:
+      state["ThreadTile1"] *= abs(state["ThreadTileShape"])
+    elif state["ThreadTileShape"] < 0:
+      state["ThreadTile0"] *= abs(state["ThreadTileShape"])
 
     # macro tile sizes
     if "WorkGroup0" in state and "ThreadTile0" in state:
diff --git a/Tensile/__init__.py b/Tensile/__init__.py
index df4be5e09..0b167e619 100644
--- a/Tensile/__init__.py
+++ b/Tensile/__init__.py
@@ -1 +1 @@
-__version__ = "2.1.4"
+__version__ = "2.1.5"

From 83da8dbf7a460c250add4da285d6f67cfa9cd6fc Mon Sep 17 00:00:00 2001
From: David Tanner <guacamoleo@gmail.com>
Date: Fri, 24 Feb 2017 16:11:24 -0600
Subject: [PATCH 08/21] fixed shape definition

---
 Tensile/BenchmarkStructs.py | 8 ++++----
 Tensile/SolutionStructs.py  | 8 ++++----
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/Tensile/BenchmarkStructs.py b/Tensile/BenchmarkStructs.py
index 236f67eb9..86742be6c 100644
--- a/Tensile/BenchmarkStructs.py
+++ b/Tensile/BenchmarkStructs.py
@@ -399,13 +399,13 @@ def convertParametersToSteps(self):
           macroTileDim0 = workGroupEdgeValues[workGroupEdgeIdx]*threadTileEdgeValues[threadTileEdgeIdx]
           macroTileDim1 = macroTileDim0
           if workGroupShapeValues[workGroupShapeIdx] < 0:
-            macroTileDim0 *= abs(workGroupShapeValues[workGroupShapeIdx])
+            macroTileDim0 *= 2*abs(workGroupShapeValues[workGroupShapeIdx])
           elif workGroupShapeValues[workGroupShapeIdx] > 0:
-            macroTileDim1 *= abs(workGroupShapeValues[workGroupShapeIdx])
+            macroTileDim1 *= 2*abs(workGroupShapeValues[workGroupShapeIdx])
           if threadTileShapeValues[threadTileShapeIdx] < 0:
-            macroTileDim0 *= abs(threadTileShapeValues[threadTileShapeIdx])
+            macroTileDim0 *= 2*abs(threadTileShapeValues[threadTileShapeIdx])
           elif threadTileShapeValues[threadTileShapeIdx] > 0:
-            macroTileDim1 *= abs(threadTileShapeValues[threadTileShapeIdx])
+            macroTileDim1 *= 2*abs(threadTileShapeValues[threadTileShapeIdx])
           if macroTileDim0/macroTileDim1 <= self.initialSolutionParameters["MacroTileMaxRatio"] and macroTileDim1/macroTileDim0 <= self.initialSolutionParameters["MacroTileMaxRatio"]:
             macroTileJoinSet.add((macroTileDim0, macroTileDim1))
         totalPermutations *=len(macroTileJoinSet)
diff --git a/Tensile/SolutionStructs.py b/Tensile/SolutionStructs.py
index eeeef16af..bcaa80acd 100644
--- a/Tensile/SolutionStructs.py
+++ b/Tensile/SolutionStructs.py
@@ -563,17 +563,17 @@ def assignProblemIndependentDerivedParameters(state):
     state["WorkGroup0"] = state["WorkGroupEdge"]
     state["WorkGroup1"] = state["WorkGroupEdge"]
     if state["WorkGroupShape"] > 0:
-      state["WorkGroup1"] *= abs(state["WorkGroupShape"])
+      state["WorkGroup1"] *= 2*abs(state["WorkGroupShape"])
     elif state["WorkGroupShape"] < 0:
-      state["WorkGroup0"] *= abs(state["WorkGroupShape"])
+      state["WorkGroup0"] *= 2*abs(state["WorkGroupShape"])
 
     # thread tile sizes
     state["ThreadTile0"] = state["ThreadTileEdge"]
     state["ThreadTile1"] = state["ThreadTileEdge"]
     if state["ThreadTileShape"] > 0:
-      state["ThreadTile1"] *= abs(state["ThreadTileShape"])
+      state["ThreadTile1"] *= 2*abs(state["ThreadTileShape"])
     elif state["ThreadTileShape"] < 0:
-      state["ThreadTile0"] *= abs(state["ThreadTileShape"])
+      state["ThreadTile0"] *= 2*abs(state["ThreadTileShape"])
 
     # macro tile sizes
     if "WorkGroup0" in state and "ThreadTile0" in state:

From e87d18dbc0ffc52b10a1ce94750e29833d1eb0ec Mon Sep 17 00:00:00 2001
From: David Tanner <guacamoleo@gmail.com>
Date: Fri, 24 Feb 2017 16:29:00 -0600
Subject: [PATCH 09/21] min threads

---
 Tensile/BenchmarkStructs.py |   8 +--
 Tensile/Common.py           |   5 +-
 Tensile/SolutionStructs.py  |  11 ++--
 Tensile/SolutionWriter.py   | 124 ------------------------------------
 4 files changed, 14 insertions(+), 134 deletions(-)

diff --git a/Tensile/BenchmarkStructs.py b/Tensile/BenchmarkStructs.py
index 86742be6c..236f67eb9 100644
--- a/Tensile/BenchmarkStructs.py
+++ b/Tensile/BenchmarkStructs.py
@@ -399,13 +399,13 @@ def convertParametersToSteps(self):
           macroTileDim0 = workGroupEdgeValues[workGroupEdgeIdx]*threadTileEdgeValues[threadTileEdgeIdx]
           macroTileDim1 = macroTileDim0
           if workGroupShapeValues[workGroupShapeIdx] < 0:
-            macroTileDim0 *= 2*abs(workGroupShapeValues[workGroupShapeIdx])
+            macroTileDim0 *= abs(workGroupShapeValues[workGroupShapeIdx])
           elif workGroupShapeValues[workGroupShapeIdx] > 0:
-            macroTileDim1 *= 2*abs(workGroupShapeValues[workGroupShapeIdx])
+            macroTileDim1 *= abs(workGroupShapeValues[workGroupShapeIdx])
           if threadTileShapeValues[threadTileShapeIdx] < 0:
-            macroTileDim0 *= 2*abs(threadTileShapeValues[threadTileShapeIdx])
+            macroTileDim0 *= abs(threadTileShapeValues[threadTileShapeIdx])
           elif threadTileShapeValues[threadTileShapeIdx] > 0:
-            macroTileDim1 *= 2*abs(threadTileShapeValues[threadTileShapeIdx])
+            macroTileDim1 *= abs(threadTileShapeValues[threadTileShapeIdx])
           if macroTileDim0/macroTileDim1 <= self.initialSolutionParameters["MacroTileMaxRatio"] and macroTileDim1/macroTileDim0 <= self.initialSolutionParameters["MacroTileMaxRatio"]:
             macroTileJoinSet.add((macroTileDim0, macroTileDim1))
         totalPermutations *=len(macroTileJoinSet)
diff --git a/Tensile/Common.py b/Tensile/Common.py
index 2cf0e0906..08e85b586 100644
--- a/Tensile/Common.py
+++ b/Tensile/Common.py
@@ -49,6 +49,7 @@
 globalParameters["DataInitType"] = 0 # 0=rand, 1=1, 2=serial
 # protect against invalid kernel
 globalParameters["MaxThreads"] = 256
+globalParameters["MinThreads"] = 64
 globalParameters["MaxRegisters"] = 256
 globalParameters["MaxLDS"] = 32768
 
@@ -73,9 +74,9 @@
 # benchmark these solution independently
 defaultForkParameters = [
     {"WorkGroupEdge":           [ 16, 8 ] },
-    {"WorkGroupShape":          [ 0 ] }, # -1, 0, 1
+    {"WorkGroupShape":          [ 0 ] }, # -4, -2, 0, 2, 4
     {"ThreadTileEdge":          [ 1, 2, 4, 6, 8 ] },
-    {"ThreadTileShape":         [ 0 ] }, # -1, 0, 1
+    {"ThreadTileShape":         [ 0 ] }, # -4, -2, 0, 2, 4
     {"NumLoadsCoalescedA":       [ 1, -1 ] },
     {"NumLoadsCoalescedB":       [ 1, -1 ] },
     ]
diff --git a/Tensile/SolutionStructs.py b/Tensile/SolutionStructs.py
index bcaa80acd..e4a853804 100644
--- a/Tensile/SolutionStructs.py
+++ b/Tensile/SolutionStructs.py
@@ -563,17 +563,17 @@ def assignProblemIndependentDerivedParameters(state):
     state["WorkGroup0"] = state["WorkGroupEdge"]
     state["WorkGroup1"] = state["WorkGroupEdge"]
     if state["WorkGroupShape"] > 0:
-      state["WorkGroup1"] *= 2*abs(state["WorkGroupShape"])
+      state["WorkGroup1"] *= abs(state["WorkGroupShape"])
     elif state["WorkGroupShape"] < 0:
-      state["WorkGroup0"] *= 2*abs(state["WorkGroupShape"])
+      state["WorkGroup0"] *= abs(state["WorkGroupShape"])
 
     # thread tile sizes
     state["ThreadTile0"] = state["ThreadTileEdge"]
     state["ThreadTile1"] = state["ThreadTileEdge"]
     if state["ThreadTileShape"] > 0:
-      state["ThreadTile1"] *= 2*abs(state["ThreadTileShape"])
+      state["ThreadTile1"] *= abs(state["ThreadTileShape"])
     elif state["ThreadTileShape"] < 0:
-      state["ThreadTile0"] *= 2*abs(state["ThreadTileShape"])
+      state["ThreadTile0"] *= abs(state["ThreadTileShape"])
 
     # macro tile sizes
     if "WorkGroup0" in state and "ThreadTile0" in state:
@@ -589,6 +589,9 @@ def assignProblemIndependentDerivedParameters(state):
     if state["NumThreads"] > globalParameters["MaxThreads"]:
       if printReason: print2("rejecting %u threads" % state["NumThreads"])
       state["Valid"] = False
+    if state["NumThreads"] < globalParameters["MinThreads"]:
+      if printReason: print2("rejecting %u threads" % state["NumThreads"])
+      state["Valid"] = False
     state["AssignedProblemIndependentDerivedParameters"] = True
 
   ########################################
diff --git a/Tensile/SolutionWriter.py b/Tensile/SolutionWriter.py
index 452c9c45e..ea7585b57 100644
--- a/Tensile/SolutionWriter.py
+++ b/Tensile/SolutionWriter.py
@@ -787,127 +787,3 @@ def getHeaderFileString(self, solution):
     return fileStr
 
 
-  ##############################################################################
-  # are solution parameters (dict) self-consistent
-  ##############################################################################
-  @ staticmethod
-  def solutionParametersConsistent(solution):
-    printReason = False
-
-    numThreads = solution["WorkGroup0"]*solution["WorkGroup1"]
-    if numThreads > globalParameters["MaxThreads"]:
-      if printReason: print2("rejecting %u threads" % numThreads)
-      return False
-
-    # how many elements to load
-    if solution["ProblemType"]["TLUA"]:
-      totalElementsParaA = solution["MacroTile0"]
-      totalElementsPerpA = solution["LoopUnroll"]
-    else:
-      totalElementsParaA = solution["LoopUnroll"]
-      totalElementsPerpA = solution["MacroTile0"]
-
-    if solution["ProblemType"]["TLUB"]:
-      totalElementsParaB = solution["MacroTile1"]
-      totalElementsPerpB = solution["LoopUnroll"]
-    else:
-      totalElementsParaB = solution["LoopUnroll"]
-      totalElementsPerpB = solution["MacroTile1"]
-    totalElementsA = totalElementsParaA * totalElementsPerpA
-    totalElementsB = totalElementsParaB * totalElementsPerpB
-
-    # how many load instructions
-    if totalElementsA % numThreads != 0:
-      if printReason: print2("totalElementsA %u %% numThreads %u != 0" \
-          % (totalElementsA, numThreads))
-      return False
-    else:
-      solution["NumLoadsA"] = totalElementsA / numThreads
-    if totalElementsB % numThreads != 0:
-      if printReason: print2("totalElementsB %u %% numThreads %u != 0" \
-          % (totalElementsB, numThreads))
-      return False
-    else:
-      solution["NumLoadsB"] = totalElementsB / numThreads
-
-    # how many loads para
-    if solution["NumLoadsCoalescedA"] < 1:
-      solution["NumLoadsCoalescedA"] = solution["NumLoadsA"]
-    if solution["NumLoadsA"] % solution["NumLoadsCoalescedA"] != 0:
-      if printReason: print2("numLoadsA %u %% numLoadsParaA %u != 0" \
-          % (solution["NumLoadsA"], solution["NumLoadsCoalescedA"]))
-      return False
-    else:
-      solution["NumLoadsPerpendicularA"] = solution["NumLoadsA"] \
-          / solution["NumLoadsCoalescedA"]
-    if solution["NumLoadsCoalescedB"] < 1:
-      solution["NumLoadsCoalescedB"] = solution["NumLoadsB"]
-    if solution["NumLoadsB"] % solution["NumLoadsCoalescedB"] != 0:
-      if printReason: print2("numLoadsB %u %% numLoadsParaB %u != 0" \
-          % (solution["NumLoadsB"], solution["NumLoadsCoalescedB"]))
-      return False
-    else:
-      solution["NumLoadsPerpendicularB"] = solution["NumLoadsB"] \
-          / solution["NumLoadsCoalescedB"]
-
-    # load size para/perp A
-    if totalElementsParaA % solution["NumLoadsCoalescedA"] != 0:
-      if printReason: print2("totalElementsParaA %u %% numLoadsParaA %u != 0" \
-          % (totalElementsParaA, solution["NumLoadsCoalescedA"]))
-      return False
-    #else:
-    #  loadSizeParaA = totalElementsParaA / solution["NumLoadsCoalescedA"]
-    if totalElementsPerpA % solution["NumLoadsPerpendicularA"] != 0:
-      if printReason: print2("totalElementsPerpA %u %% numLoadsPerpA %u != 0" \
-          % (totalElementsPerpA, solution["NumLoadsPerpendicularA"]))
-      return False
-    #else:
-    #  loadSizePerpA = totalElementsPerpA / solution["NumLoadsPerpendicularA"]
-
-    # load size para/perp B
-    if totalElementsParaB % solution["NumLoadsCoalescedB"] != 0:
-      if printReason: print2("totalElementsParaB %u %% numLoadsParaB %u != 0" \
-          % (totalElementsParaB, solution["NumLoadsCoalescedB"]))
-      return False
-    #else:
-    #  loadSizeParaB = totalElementsParaB / solution["NumLoadsCoalescedB"]
-    if totalElementsPerpB % solution["NumLoadsPerpendicularB"] != 0:
-      if printReason: print2("totalElementsPerpB %u %% numLoadsPerpB %u != 0" \
-          % (totalElementsPerpB, solution["NumLoadsPerpendicularB"]))
-      return False
-    #else:
-    #  loadSizePerpB = totalElementsPerpB / solution["NumLoadsPerpendicularB"]
-
-    # too much LDS
-    sizeLDS = solution["LoopUnroll"] \
-        * (solution["PadLDS"] * 2 + solution["MacroTile0"] \
-        + solution["MacroTile1"] ) \
-        * solution["ProblemType"]["DataType"].numBytes()
-    if sizeLDS > globalParameters["MaxLDS"]:
-      if printReason: print2("Kernel Uses %u > %u bytes" % ( sizeLDS, globalParameters["MaxLDS"]))
-      return False
-
-    # Compiler may be causing incorrect spills on ROCm1.4 from DT on 2/21/17
-    if globalParameters["Backend"] == "HIP":
-      if solution["ProblemType"]["DataType"].value == DataType.single:
-        if solution["MacroTile0"] == 128 or solution["MacroTile1"] == 128:
-          if solution["NumLoadsCoalescedA"] != 1 and solution["NumLoadsCoalescedB"] != 8:
-            return False
-      elif solution["ProblemType"]["DataType"].value == DataType.double:
-        if globalParameters["Backend"] == "HIP":
-          if solution["MacroTile0"] >= 64 or solution["MacroTile1"] >= 64:
-            return False
-# validation failures
-# Cijk_Ailk_Bjlk_SB_DU16_LU16_MT064_MT164_NLA16_NLB16_NLCA02_NLCB01_NLPA08_NLPB16_TT008_TT108_TTE08_WG008_WG108_WGE08
-# Cijk_Ailk_Bjlk_SB_DU16_LU16_MT064_MT164_NLA16_NLB16_NLCA04_NLCB02_NLPA04_NLPB08_TT008_TT108_TTE08_WG008_WG108_WGE08
-# Cijk_Ailk_Bjlk_SB_DU16_LU16_MT064_MT164_NLA16_NLB16_NLCA02_NLCB04_NLPA08_NLPB04_TT008_TT108_TTE08_WG008_WG108_WGE08
-
-# Cijk_Ailk_Bjlk_DB_DU16_LU16_MT064_MT164_NLA16_NLB16_NLCA04_NLCB01_NLPA04_NLPB16_TT008_TT108_TTE08_WG008_WG108_WGE08
-# Cijk_Ailk_Bjlk_DB_DU08_LU08_MT064_MT164_NLA08_NLB08_NLCA01_NLCB01_NLPA08_NLPB08_TT008_TT108_TTE08_WG008_WG108_WGE08
-# Cijk_Ailk_Bjlk_DB_DU08_LU08_MT064_MT164_NLA08_NLB08_NLCA08_NLCB01_NLPA01_NLPB08_TT008_TT108_TTE08_WG008_WG108_WGE08
-# Cijk_Ailk_Bjlk_DB_DU08_LU08_MT064_MT164_NLA08_NLB08_NLCA08_NLCB08_NLPA01_NLPB01_TT008_TT108_TTE08_WG008_WG108_WGE08
-# Cijk_Ailk_Bjlk_DB_DU16_LU16_MT064_MT164_NLA16_NLB16_NLCA08_NLCB08_NLPA02_NLPB02_TT008_TT108_TTE08_WG008_WG108_WGE08
-# Cijk_Ailk_Bjlk_DB_DU08_LU08_MT064_MT164_NLA08_NLB08_NLCA01_NLCB08_NLPA08_NLPB01_TT008_TT108_TTE08_WG008_WG108_WGE08
-
-    return True
-

From 851416b3844177aeced026d7a17bb4a6fc526e5c Mon Sep 17 00:00:00 2001
From: David Tanner <guacamoleo@gmail.com>
Date: Fri, 24 Feb 2017 16:52:43 -0600
Subject: [PATCH 10/21] enabling max thread tile to support skinny

---
 Tensile/BenchmarkStructs.py | 6 ++++--
 Tensile/Common.py           | 5 +++--
 Tensile/SolutionStructs.py  | 8 ++++++++
 Tensile/__init__.py         | 2 +-
 4 files changed, 16 insertions(+), 5 deletions(-)

diff --git a/Tensile/BenchmarkStructs.py b/Tensile/BenchmarkStructs.py
index 236f67eb9..48bbf9813 100644
--- a/Tensile/BenchmarkStructs.py
+++ b/Tensile/BenchmarkStructs.py
@@ -2,7 +2,7 @@
 # from copy import *
 
 from copy import copy, deepcopy
-from Common import print1, print2, printWarning, defaultSolution, defaultProblemSizes, defaultBenchmarkFinalProblemSizes, defaultBenchmarkCommonParameters, hasParam, defaultBenchmarkJoinParameters, getParamValues, defaultForkParameters, defaultBenchmarkForkParameters, defaultJoinParameters, printExit
+from Common import print1, print2, printWarning, defaultSolution, defaultProblemSizes, defaultBenchmarkFinalProblemSizes, defaultBenchmarkCommonParameters, hasParam, defaultBenchmarkJoinParameters, getParamValues, defaultForkParameters, defaultBenchmarkForkParameters, defaultJoinParameters, printExit, globalParameters
 from SolutionStructs import Solution, ProblemType, ProblemSizes
 
 ################################################################################
@@ -406,7 +406,9 @@ def convertParametersToSteps(self):
             macroTileDim0 *= abs(threadTileShapeValues[threadTileShapeIdx])
           elif threadTileShapeValues[threadTileShapeIdx] > 0:
             macroTileDim1 *= abs(threadTileShapeValues[threadTileShapeIdx])
-          if macroTileDim0/macroTileDim1 <= self.initialSolutionParameters["MacroTileMaxRatio"] and macroTileDim1/macroTileDim0 <= self.initialSolutionParameters["MacroTileMaxRatio"]:
+          # TODO is this still useful?
+          if macroTileDim0/macroTileDim1 <= globalParameters["MaxMacroTileRatio"] \
+              and macroTileDim1/macroTileDim0 <= globalParameters["MaxMacroTileRatio"]:
             macroTileJoinSet.add((macroTileDim0, macroTileDim1))
         totalPermutations *=len(macroTileJoinSet)
         print2("JoinMacroTileSet(%u): %s" % (len(macroTileJoinSet), macroTileJoinSet) )
diff --git a/Tensile/Common.py b/Tensile/Common.py
index 08e85b586..ef2b56533 100644
--- a/Tensile/Common.py
+++ b/Tensile/Common.py
@@ -50,8 +50,9 @@
 # protect against invalid kernel
 globalParameters["MaxThreads"] = 256
 globalParameters["MinThreads"] = 64
-globalParameters["MaxRegisters"] = 256
 globalParameters["MaxLDS"] = 32768
+globalParameters["MaxMacroTileRatio"] = 4
+globalParameters["MaxThreadTile"] = 64
 
 
 ################################################################################
@@ -101,6 +102,7 @@
 derivedParameters = [
     "MacroTile0",
     "MacroTile1",
+    "DepthU",
     "WorkGroup0",
     "WorkGroup1",
     "ThreadTile0",
@@ -120,7 +122,6 @@
     for key, value in paramDict.iteritems():
       defaultSolution[key] = value[0]
 # other non-benchmark options for solutions
-defaultSolution["MacroTileMaxRatio"] = 2
 
 ################################################################################
 # Default Problem Type
diff --git a/Tensile/SolutionStructs.py b/Tensile/SolutionStructs.py
index e4a853804..7eb18fa25 100644
--- a/Tensile/SolutionStructs.py
+++ b/Tensile/SolutionStructs.py
@@ -592,6 +592,14 @@ def assignProblemIndependentDerivedParameters(state):
     if state["NumThreads"] < globalParameters["MinThreads"]:
       if printReason: print2("rejecting %u threads" % state["NumThreads"])
       state["Valid"] = False
+
+    if state["MacroTile0"]/state["MacroTile1"] > globalParameters["MaxMacroTileRatio"] \
+        or state["MacroTile1"]/state["MacroTile0"] > globalParameters["MaxMacroTileRatio"] \
+        or state["ThreadTile0"]*state["ThreadTile1"]*state["ProblemType"]["DataType"].numRegisters() > globalParameters["MaxThreadTile"]:
+      state["Valid"] = False
+
+
+
     state["AssignedProblemIndependentDerivedParameters"] = True
 
   ########################################
diff --git a/Tensile/__init__.py b/Tensile/__init__.py
index 0b167e619..edc60b350 100644
--- a/Tensile/__init__.py
+++ b/Tensile/__init__.py
@@ -1 +1 @@
-__version__ = "2.1.5"
+__version__ = "2.1.6"

From 0b7a7b90ff2a852f7f4ef8a2b56a92814b8ae865 Mon Sep 17 00:00:00 2001
From: David Tanner <guacamoleo@gmail.com>
Date: Fri, 24 Feb 2017 17:07:05 -0600
Subject: [PATCH 11/21] moving tile shape to problem-size dependent b/c
 numRegisters

---
 Tensile/SolutionStructs.py | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/Tensile/SolutionStructs.py b/Tensile/SolutionStructs.py
index 7eb18fa25..7dc27a644 100644
--- a/Tensile/SolutionStructs.py
+++ b/Tensile/SolutionStructs.py
@@ -593,15 +593,15 @@ def assignProblemIndependentDerivedParameters(state):
       if printReason: print2("rejecting %u threads" % state["NumThreads"])
       state["Valid"] = False
 
+    # tile shape
     if state["MacroTile0"]/state["MacroTile1"] > globalParameters["MaxMacroTileRatio"] \
-        or state["MacroTile1"]/state["MacroTile0"] > globalParameters["MaxMacroTileRatio"] \
-        or state["ThreadTile0"]*state["ThreadTile1"]*state["ProblemType"]["DataType"].numRegisters() > globalParameters["MaxThreadTile"]:
+        or state["MacroTile1"]/state["MacroTile0"] > globalParameters["MaxMacroTileRatio"]:
       state["Valid"] = False
 
-
-
+    # done
     state["AssignedProblemIndependentDerivedParameters"] = True
 
+
   ########################################
   # assign all derived parameters
   @staticmethod
@@ -615,6 +615,10 @@ def assignDerivedParameters(state):
     ProblemType.assignDerivedParameters(state["ProblemType"])
     printReason = False
 
+    # tile size
+    if state["ThreadTile0"]*state["ThreadTile1"]*state["ProblemType"]["DataType"].numRegisters() > globalParameters["MaxThreadTile"]:
+      state["Valid"] = False
+
     # how many elements to load
     if state["ProblemType"]["TLUA"]:
       totalElementsCoalescedA = state["MacroTile0"]

From c610aefdc8f50280d9a6eec21a60c41786c17e3d Mon Sep 17 00:00:00 2001
From: David Tanner <guacamoleo@gmail.com>
Date: Wed, 1 Mar 2017 11:15:27 -0600
Subject: [PATCH 12/21] revising logic to handle global problems

---
 Tensile/Common.py          |    5 +
 Tensile/LibraryLogic.py    | 1098 +++++++++++++++++++++++++++++++-----
 Tensile/SolutionStructs.py |    2 +
 3 files changed, 954 insertions(+), 151 deletions(-)

diff --git a/Tensile/Common.py b/Tensile/Common.py
index ef2b56533..557690f79 100644
--- a/Tensile/Common.py
+++ b/Tensile/Common.py
@@ -154,6 +154,11 @@
 defaultAnalysisParameters = {
     "Dilation":                 3,
     "Threshold":                0.1,
+    "OutlierThreshold":         0,
+    "FractionTimeSavedMin":     0.01, # = 1%
+    "Weight0":                  100,
+    "Weight1":                  100,
+    "Weight2":                  100,
     }
 
 
diff --git a/Tensile/LibraryLogic.py b/Tensile/LibraryLogic.py
index 881631f2b..72b8d60cb 100644
--- a/Tensile/LibraryLogic.py
+++ b/Tensile/LibraryLogic.py
@@ -12,7 +12,7 @@
 ################################################################################
 # Analyze Problem Type
 ################################################################################
-def analyzeProblemType( problemTypeTuple, analysisParameters ):
+def analyzeProblemType( problemTypeTuple, inputParameters ):
   problemType = problemTypeTuple[0]
   problemSizes = problemTypeTuple[1]
   dataFileName = problemTypeTuple[2]
@@ -23,6 +23,7 @@ def analyzeProblemType( problemTypeTuple, analysisParameters ):
   #print "#  %s" % dataFileName
   #print "#  %s" % solutionsFileName
 
+  ######################################
   # Read Solutions
   (problemSizes, solutions) = YAMLIO.readSolutions(solutionsFileName)
   print2("# ProblemSizes: %s" % problemSizes)
@@ -30,130 +31,45 @@ def analyzeProblemType( problemTypeTuple, analysisParameters ):
   print2("# Solutions:")
   solutionIdx = 0
   for solution in solutions:
-    print2("#  (%u) %s" % (solutionIdx, Solution.getNameMin(solution, solutionMinNaming)))
+    print2("#  (%u) %s" % (solutionIdx, Solution.getNameMin(solution, \
+        solutionMinNaming)))
     solutionIdx += 1
   print2(HR)
 
+  ######################################
   # Read Data From CSV
-  #numProblemSizes = problemSizes.numProblemSizes
-  data = BenchmarkDataAnalyzer(problemType, problemSizes, solutions, \
-      analysisParameters)
-  data.populateFromCSV(dataFileName)
-
-  ##############################################################################
-  # Determine Solutions Along Diagonal
-  # roughly same splitting regardless of sizeU
-  problemIndices = []
-  for numProblemsForIndex in data.numProblemSizes:
-    problemIndices.append(numProblemsForIndex-1)
-  diagonalRules = data.getFastestSolutionsAlongDiagonal(problemIndices)
-  if True:
-    print2("Diagonal Rules:")
-    for rule in diagonalRules:
-      string = "  if freeSize >=%4u" % data.problemIndexToSize[0][rule[1][0]]
-      for i in range(1, data.numIndices):
-        string += "x%4u" % data.problemIndexToSize[i][rule[1][i]]
-      string += " return S[%u] @ %5.0f-%5.0f>%5.0f GFlops is %s" \
-          % (rule[0], rule[2], rule[3], rule[4], \
-          data.solutionNames[rule[0]])
-      print2(string)
-
-  ##############################################################################
-  # Determine Skinny0 Solutions
-  skinnyRules01 = data.getSkinnySolutions(diagonalRules, problemIndices, \
-      data.idx0, data.idx1)
-  #print "Skinny Rules:"
-  #for rule in skinnyRules01:
-  #  string = "  if freeSize >=%4u" % data.problemIndexToSize[0][rule[1][0]]
-  #  for i in range(1, data.numIndices):
-  #    string += "x%4u" % data.problemIndexToSize[i][rule[1][i]]
-  #  string += " return S[%u] @ %5.0f-%5.0f>%5.0f GFlops is %s" \
-  #      % (rule[0], rule[2], rule[3], rule[4], \
-  #      data.solutionNames[rule[0]])
-
-  ##############################################################################
-  # Determine Skinny1 Solutions
-  skinnyRules10 = data.getSkinnySolutions(diagonalRules, problemIndices, \
-      data.idx1, data.idx0)
-
-  # list solutions that actually get used
-  solutionIndicesUsed = []
-  for rule in skinnyRules01:
-    pass
-  for rule in skinnyRules10:
-    pass
-  for rule in diagonalRules:
-    solutionIdx = rule[0]
-    solution = solutions[solutionIdx]
-    MT0 = solution["MacroTile0"]
-    MT1 = solution["MacroTile1"]
-    DU = solution["DepthU"]
-    #print "Rule Tile S[%u]: %ux%ux%u" % (solutionIdx, MT0, MT1, DU)
-    # is this solution in the list
-    inList = False
-    for solutionUsed in solutionIndicesUsed:
-      if solutionUsed[0] == solutionIdx:
-        inList = True
-        break
-    if not inList:
-      insertIdx = len(solutionIndicesUsed)
-      for i in range(0, len(solutionIndicesUsed)):
-        iMT0 = solutionIndicesUsed[i][1]
-        iMT1 = solutionIndicesUsed[i][2]
-        iDU  = solutionIndicesUsed[i][3]
-        #print "  compare S[%u]: %ux%ux%u" % (solutionIndicesUsed[i][0], \
-        #    iMT0, iMT1, iDU)
-        if MT0*MT1 < iMT0*iMT1:
-          insertIdx = i
-          break
-        elif MT0*MT1 > iMT0*iMT1:
-          continue
-        else: # MT == MT
-          if DU < iDU:
-            insertIdx = i
-            break
-          else:
-            continue
+  logic = LogicAnalyzer(problemType, problemSizes, solutions, inputParameters)
+  logic.populateFromCSV(dataFileName)
 
-        # if i'm smaller than i, insert me before i
-      #print "insert: %u" % insertIdx
-      solutionIndicesUsed.insert(insertIdx, [solutionIdx, MT0, MT1, DU])
-  #print solutionIndicesUsed
+  ######################################
+  # Remove invalid solutions
+  logic.removeInvalidSolutions()
 
-  # list of solutions used
-  solutionsUsed = []
-  for solutionIndexUsed in solutionIndicesUsed:
-    solutionsUsed.append(solutions[solutionIndexUsed[0]])
+  ######################################
+  # Remove least important solutions
+  logic.removeLeastImportantSolutions()
 
-  # translate rules to new solution indices
-  for rule in skinnyRules01:
-    pass
-  for rule in skinnyRules10:
-    pass
-  for ruleIdx in range(0, len(diagonalRules)):
-    solutionIdx = diagonalRules[ruleIdx][0]
-    for i in range(0, len(solutionIndicesUsed)):
-      solutionIndexUsed = solutionIndicesUsed[i]
-      if solutionIdx == solutionIndexUsed[0]:
-        diagonalRules[ruleIdx][0] = i
-        break
-    # change problemSizeIndices to sizes
-    for i in range(0, 3):
-      diagonalRules[ruleIdx][1][i] = \
-          data.problemIndexToSize[i][ diagonalRules[ruleIdx][1][i] ]
+  ######################################
+  # Correct outliers
+  # logic.smooth()
+  logic.print2D([0, 0])
+
+  ######################################
+  # Create Rules
+  logic.enRule(0, logic.globalIndexRange)
 
-  print2("# New Rules: %s" % diagonalRules)
 
 
   #return (skinnyRules01, skinnyRules10, diagonalRules)
-  return (problemType, solutionsUsed, [], [], diagonalRules )
+  #return (problemType, logic.solutionsUsed, [], [], logic.diagonalRules )
+  return (problemType, [], [], [], [] )
 
 
 
 ################################################################################
-# BenchmarkDataAnalyzer
+# LogicAnalyzer
 ################################################################################
-class BenchmarkDataAnalyzer:
+class LogicAnalyzer:
 
   ########################################
   # diagonal rule looks like
@@ -169,26 +85,39 @@ class BenchmarkDataAnalyzer:
   # 1: problemIndices for minThreshold problem
   # 2: gflops at above minSize
 
+  ##############################################################################
+  ##############################################################################
+  ###
+  ###  Entry / Top-Level Functions
+  ###
+  ##############################################################################
+  ##############################################################################
 
-  def __init__(self, problemType, problemSizes, solutions, analysisParameters):
+  ##############################################################################
+  # ENTRY: Init
+  ##############################################################################
+  def __init__(self, problemType, problemSizes, solutions, inputParameters):
     self.problemType = problemType
     self.problemSizes = problemSizes
-    self.analysisParameters = analysisParameters
+    self.parameters = inputParameters
     print2("ProblemSizes: %s" % self.problemSizes)
     # TODO verify that data is symmetric for diagonal
     #if self.problemSizes[self.problemType["Index0"]] \
     #    != self.problemSizes[self.problemType["Index1"]]:
     #  printExit("d0 / d1 must be symmetric for analysis.")
     self.numProblemSizes = problemSizes.numProblemSizes # native order
-    print2("NumProblemSizes: %s" % self.numProblemSizes)
+    print1("NumProblemSizes: %s" % self.numProblemSizes)
     self.numIndices = len(self.numProblemSizes)
     self.solutions = solutions
     self.numSolutions = len(self.solutions)
     self.solutionMinNaming = Solution.getMinNaming(solutions)
     self.solutionNames = []
+    self.solutionTiles = []
     for solution in self.solutions:
       self.solutionNames.append(Solution.getNameMin(solution, \
           self.solutionMinNaming))
+      self.solutionTiles.append("%ux%u"%(solution["MacroTile0"], solution["MacroTile1"]))
+    self.flopsPerMac = self.problemType["DataType"].flopsPerMac()
 
     # special indices
     self.idx0 = self.problemType["Index0"]
@@ -229,13 +158,27 @@ def __init__(self, problemType, problemSizes, solutions, analysisParameters):
         currentSize += currentStride
         currentStride += index[2]
         idx += 1
+    self.rangeIndicesFree = range(0, self.problemType["NumIndicesC"])
+    self.rangeIndicesSummation = range(self.problemType["NumIndicesC"], \
+        self.problemType["TotalIndices"])
+    self.w0 = self.parameters["Weight0"]
+    self.w1 = self.parameters["Weight1"]
+    self.w2 = self.parameters["Weight2"]
     #print "S->I %s" % self.problemSizeToIndex
     #print "I->S %s" % self.problemIndexToSize
+    self.indexOrder = self.recommendedIndexOrder()
+    print2("IndexOrder: %s" % self.indexOrder)
+    self.globalIndexRange = []
+    for i in range(0, self.numIndices):
+      self.globalIndexRange.append([0, self.numProblemSizes[i]])
+    self.problemIndicesForGlobalRange \
+        = self.problemIndicesForRange(self.globalIndexRange)
 
 
 
   ##############################################################################
-  # Read In CSV
+  # ENTRY: Read In CSV
+  ##############################################################################
   def populateFromCSV(self, dataFileName):
 
     # open file
@@ -288,14 +231,473 @@ def populateFromCSV(self, dataFileName):
 
 
   ##############################################################################
-  # Get Fastest Solutions Along Diagonal (d0=d1) for largest sizes
+  # ENTRY: Remove Invalid Solutions
+  ##############################################################################
+  def removeInvalidSolutions(self):
+    # Drop, one per pass, any solution that recorded 0 gflops for some problem.
+    allSolutionValid = False
+    while not allSolutionValid:
+      moreProblems = True # NOTE(review): assigned but never read in this method
+      invalidIdx = -1
+      for problemIndices in self.problemIndicesForGlobalRange:
+        problemIdx = self.indicesToSerial(0, problemIndices)
+        for solutionIdx in range(0, self.numSolutions):
+          gflops = self.data[problemIdx+solutionIdx]
+          if gflops == 0:
+            invalidIdx = solutionIdx
+            break # leaves only the inner loop; remaining problems are still scanned
+      if invalidIdx >= 0:
+        print1("# Removing Invalid Solution: %u %s" \
+            % (invalidIdx, self.solutionNames[invalidIdx]) )
+        self.removeSolution(invalidIdx) # rebuilds self.data, so rescan from scratch
+      else:
+        allSolutionValid = True
+
+
+  ##############################################################################
+  # ENTRY: Remove Least Important Solutions
+  ##############################################################################
+  def removeLeastImportantSolutions(self):
+    # Prune solutions whose share of the total time saved is below
+    # parameters["FractionTimeSavedMin"]. leastImportantSolution() compares a
+    # winner against a runner-up, so stop once fewer than 2 solutions remain.
+    while self.numSolutions > 1:
+      (lisIdx, lisPercSaved, lisPercWins, lisPercExec) \
+          = self.leastImportantSolution()
+      if lisPercSaved >= self.parameters["FractionTimeSavedMin"]:
+        break
+      self.removeSolution(lisIdx)
+
+
+  ##############################################################################
+  # ENTRY: Smooth - correct outliers
+  ##############################################################################
+  def smooth(self):
+    outlierThreshold = self.parameters["OutlierThreshold"] # fractional margin
+    problemSizes = [0]*self.numIndices
+    for problemIndices in self.problemIndicesForGlobalRange:
+      problemIdx = self.indicesToSerial(0, problemIndices)
+      # outlier: along some index, BOTH neighbors are faster by the threshold
+      for solutionIdx in range(0, self.numSolutions):
+        gflops = self.data[problemIdx+solutionIdx]
+        neighborGFlops = []
+        smoothProblem = False
+        for iIdx in range(0, self.numIndices):
+          if problemIndices[iIdx] > 0 \
+              and problemIndices[iIdx] < self.numProblemSizes[iIdx]-1:
+            neighborBeforeIndices = deepcopy(problemIndices)
+            neighborAfterIndices = deepcopy(problemIndices)
+            neighborBeforeIndices[iIdx] -= 1
+            neighborAfterIndices[iIdx] += 1
+            neighborBeforeIdx = self.indicesToSerial(0, neighborBeforeIndices)
+            neighborAfterIdx = self.indicesToSerial(0, neighborAfterIndices)
+            neighborBeforeGFlops = self.data[neighborBeforeIdx+solutionIdx]
+            neighborAfterGFlops = self.data[neighborAfterIdx+solutionIdx]
+            neighborGFlops.append(neighborBeforeGFlops)
+            neighborGFlops.append(neighborAfterGFlops)
+            if neighborBeforeGFlops > gflops * (1+outlierThreshold) \
+                and neighborAfterGFlops > gflops * (1+outlierThreshold):
+              smoothProblem = True
+        if smoothProblem:
+          s = ""
+          for i in range(0, self.numIndices):
+            problemSizes[i] = self.problemIndexToSize[i][problemIndices[i]]
+            s += "%u, " % problemSizes[i]
+          new = sum(neighborGFlops)/len(neighborGFlops)
+          old = self.data[problemIdx+solutionIdx]
+          s += "%f -> %f" % (old, new)
+          print s
+          # overwrite the outlier in place with the mean of its interior neighbors
+          self.data[problemIdx+solutionIdx] = new
+
+
+  ##############################################################################
+  # ENTRY: En Rule
+  # currentIndexIndex = 0, 1, 2, 3...
+  ##############################################################################
+  def enRule(self, currentIndexIndex, currentIndexRange):
+    currentIndex = self.indexOrder[currentIndexIndex]
+    lastIndex = currentIndexIndex == self.numIndices-1
+    # NOTE(review): appears to be work in progress beyond the 1-size case
+    # if there's only 1 problem size here
+    if currentIndexRange[currentIndex][1] \
+        - currentIndexRange[currentIndex][0] == 1:
+      # this is last index, so just return fastest solution
+      if lastIndex:
+        scores = self.scoreRangeForSolutions(currentIndexRange)
+        winnerIdx = 0
+        for solutionIdx in range(1, self.numSolutions):
+          if scores[solutionIdx] < scores[winnerIdx]:
+            winnerIdx = solutionIdx
+        rule = [ -1, winnerIdx ]
+      # this isn't last index, so just return next index
+      else:
+        newIndexIndex = currentIndexIndex+1
+        newIndexRange = deepcopy(currentIndexRange)
+        rule = [ -1, self.enRule(newIndexIndex, newIndexRange) ]
+      return rule
+    # create rule for smallest size
+
+    # for all problem indices in this index
+    for problemIndex in range(currentIndexRange[currentIndex][0], \
+        currentIndexRange[currentIndex][1]):
+    # rules = seed with smallest rule
+    # for dimIdx = 0 -> numSizes
+      # if newRule
+        # score range using newRule
+        # score range using priorRule
+        # accept/reject based on score
+    # current index is dimOrder[0]
+      pass # placeholder so the loop has a body and the module parses
+
+
+
+
+    sumValues = []
+    totalSummationSizes = 1
+    for i in self.rangeIndicesSummation:
+      totalSummationSizes *= self.numProblemSizes[i]
+    summationPermutations = []
+    for permutationIdx in range(0, totalSummationSizes):
+      permutation = []
+      permutationSize = 1
+      pIdx = permutationIdx
+      for i in self.rangeIndicesSummation:
+        idx = pIdx % self.numProblemSizes[i]
+        permutation.append(idx)
+        permutationSize *= self.problemIndexToSize[i][idx]
+        pIdx //= self.numProblemSizes[i]
+      # insert permutation in sorted order (largest first); default is append
+      insertIdx = len(summationPermutations)
+      for pIdx in range(0, len(summationPermutations)):
+        size = 1
+        for k, i in enumerate(self.rangeIndicesSummation):
+          size *= self.problemIndexToSize[i][summationPermutations[pIdx][k]]
+        if permutationSize > size:
+          insertIdx = pIdx
+          break
+      summationPermutations.insert(insertIdx, permutation)
+    print "SummationPermutations:", summationPermutations
+
+
+    if len(summationPermutations) == 1:
+      rules = [ 0, self.createRules01(summationPermutations[0]) ]
+      return rules
+    else:
+      printExit("No Logic to support multiple summation sizes.")
+      # iterate over summation permutations
+# for each serial pair, scoreA, scoreB, scoreAB
+# keep rule AB if scoreAB isn't much slower than scoreA + scoreB
+
+    """
+    sizeSummation *= self.problemIndexToSize[i][problemIndices[i]]
+
+    firstProblemIndices = []
+    lastProblemIndices = []
+    for i in range(0, self.numIndices):
+      firstProblemIndices.append(0)
+      lastProblemIndices.append(self.numProblems[i]-1)
+    minSumValue = self.getSizeSummation(firstProblemIndices)
+    maxSumValue = self.getSizeSummation(lastProblemIndices)
+    numSumValues =
+
+
+    rule = [
+        [
+          minU,                             # k threshold
+          [[min01,s], [0,s]],               # diagonals
+          [0, max0, [[min1,s], [min1,s]]],  # skinny0's
+          [1, max1, [[min0,s], [min0,s]]],  # skinny1's
+        ],
+        [
+          minU,                             # k threshold
+          [[min01,s], [0,s]],               # diagonals
+          [0, max0, [[min1,s], [min1,s]]],  # skinny0's
+          [1, max1, [[min0,s], [min0,s]]],  # skinny1's
+        ],
+    ]
+
+    ruleA = createRules01()
+    ruleB = createRules01()
+
+    minSumValue = 0
+    maxSumValue = self.numProblems
+
+
+    sizeSummation = 1
+    for i in range(self.problemType["NumIndicesC"], \
+        self.problemType["TotalIndices"]):
+      sizeSummation *= self.problemIndexToSize[i][problemIndices[i]]
+    return sizeSummation
+    """
+
+
+
+  ##############################################################################
+  ##############################################################################
+  ###
+  ###  Mid-Level Functions
+  ###
+  ##############################################################################
+  ##############################################################################
+
+
+
   ##############################################################################
-  def getFastestSolutionsAlongDiagonal(self, problemIndices):
-    print2("\nFastest Diagonal idxU: %u" % problemIndices[self.idxU])
+  # Create Rules dim0 / dim1
+  ##############################################################################
+  def createRules01(self, problemSizeSummation ):
+    # NOTE(review): the result is bound locally and not returned (returns None)
+    diagonalRules = self.createRulesDiagonal(problemSizeSummation)
+
+
+  ##############################################################################
+  # Create Rules Diagonal
+  ##############################################################################
+  def createRulesDiagonal(self, problemSizeSummation):
+    thresholdForDiagonality = 1.5 # slightly fewer problems than 2
+    numProblemSizesFastestDiagonal = 16
+    problemIndices = [0]*self.numIndices
+    for i in self.rangeIndicesSummation:
+      problemIndices[i] = problemSizeSummation[i \
+          - self.problemType["NumIndicesC"]]
+    print2("\nDiagonalRules for %s" % problemIndices)
+    problemSizes = [0]*self.numIndices
+    totalFlopsPerSizeFree = self.flopsPerMac
+    for i in self.rangeIndicesSummation:
+      totalFlopsPerSizeFree *= self.problemIndexToSize[i][problemIndices[i]]
+    print "totalFlopsPerSizeFree", totalFlopsPerSizeFree
+
+    ########################################
+    # transform data into serial list of "diagonal problem sizes"
+    diagonalData = []
+    moreProblems = True
+    while moreProblems:
+
+      # size free
+      for i in range(0, self.numIndices):
+        problemSizes[i] = self.problemIndexToSize[i][problemIndices[i]]
+      size0 = problemSizes[self.idx0]
+      size1 = problemSizes[self.idx1]
+
+      # if diagonal
+      if size0 < size1*thresholdForDiagonality \
+          and size1 < size0*thresholdForDiagonality:
+        sizeFree = self.getSizeFree(problemIndices)
+
+        problemIdx = self.indicesToSerial(0, problemIndices)
+        solutionGFlops = []
+        for i in range(0, self.numSolutions):
+          solutionGFlops.append(self.data[problemIdx+i])
+
+        diagonalData.append([ sizeFree, solutionGFlops ])
+
+      # next problem
+      problemIndices[0] += 1
+      for i in self.rangeIndicesFree:
+        if problemIndices[i] >= self.numProblemSizes[i]:
+          if i == self.problemType["NumIndicesFree"]-1:
+            moreProblems = False
+            break
+          else:
+            problemIndices[i] = 0
+            problemIndices[i+1] += 1
+        else:
+          break
+
+    diagonalData.sort(key=lambda x: x[0], reverse=True)
+    for dd in diagonalData:
+      print "DD[%u]: %s" % (dd[0], dd[1])
+    print len(diagonalData)
+
+
+    ########################################
+    # create first rule
+    sizeFree = diagonalData[0][0]
+    relativeTime = [0]*self.numSolutions
+    for i in range(0, numProblemSizesFastestDiagonal):
+      for j in range(0, self.numSolutions):
+        gflops = diagonalData[i][1][j]
+        relativeTime[j] += 1 / gflops
+    winnerIdx = 0
+    winnerRelativeTime = relativeTime[0]
+    for i in range(1, self.numSolutions):
+      if relativeTime[i] < winnerRelativeTime:
+        winnerIdx = i
+        winnerRelativeTime = relativeTime[i]
+    print "FastestDiagonalSolution:", winnerIdx, self.solutionNames[winnerIdx]
+    fastestGFlops = 0
+    for i in range(0, numProblemSizesFastestDiagonal):
+      gflops = diagonalData[i][1][winnerIdx]
+      if gflops > fastestGFlops:
+        fastestGFlops = gflops
+
+    rules = []
+    #                                  minGFlops      maxGFlops      oldGFlops?
+    rules.append([winnerIdx, sizeFree, fastestGFlops, fastestGFlops, -1])
+    print "Winner[%3u]: %u" % (0, winnerIdx)
+# we can't just pay attention to single winner
+# we need to compute scores for all solutions over a window
+# b/c 441115111333
+#   = 441555555333
+#
+# we can do a smoothing pass to get rid of bogus data; if a data point is more than x% slower than its 4 surrounding points, then it's bogus; just set it equal to the average of the 4 surrounding points
+#
+
+    ########################################
+    # create subsequent rules for smaller sizes
+    for diagonalDataIdx in range(1, len(diagonalData)):
+      print "DiagonalDataIdx:", diagonalDataIdx
+      # prior rule
+      priorRule = rules[len(rules)-1]
+      priorWinnerIdx = priorRule[0]
+      # candidate winner
+      candidateWinnerIdx = 0
+      candidateWinnerGFlops = diagonalData[diagonalDataIdx][1][0]
+      for j in range(1, self.numSolutions):
+        gflops = diagonalData[diagonalDataIdx][1][j]
+        if gflops > candidateWinnerGFlops:
+          candidateWinnerIdx = j
+          candidateWinnerGFlops = gflops
+      if candidateWinnerIdx == priorWinnerIdx:
+        # update prior rule to include this sizeFree
+        rules[len(rules)-1][1] = diagonalData[diagonalDataIdx][0] # size free
+        rules[len(rules)-1][2] = \
+            diagonalData[diagonalDataIdx][1][priorWinnerIdx] # perf at size
+        continue
+      else:
+        # candidate rule
+        sizeFree = diagonalData[diagonalDataIdx][0]
+        totalFlops = sizeFree*totalFlopsPerSizeFree
+        candidateGFlops = diagonalData[diagonalDataIdx][1][candidateWinnerIdx]
+        priorGFlops = diagonalData[diagonalDataIdx][1][priorWinnerIdx]
+        candidateRule = [ candidateWinnerIdx, sizeFree, candidateGFlops, \
+            candidateGFlops, -1 ]
+        # candidate and prior scores
+        candidateTimeUs = totalFlops / candidateGFlops / 1000
+        priorTimeUs = totalFlops / priorGFlops / 1000
+        candidateScore = 1*self.w2 + candidateTimeUs
+        priorScore = 0*self.w2 + priorTimeUs
+        print "DDI[%3u] Prior[%2u]: %.0fus vs Candi[%2u]: %.0fus" \
+            % (diagonalDataIdx, priorWinnerIdx, priorScore, candidateWinnerIdx, candidateScore)
+        checkMoreProblems = True
+        for newDiagonalDataIdx in range(diagonalDataIdx+1, len(diagonalData)):
+          newWinnerIdx = 0
+          newWinnerGFlops = diagonalData[newDiagonalDataIdx][1][0]
+          for j in range(1, self.numSolutions):
+            gflops = diagonalData[newDiagonalDataIdx][1][j]
+            if gflops > newWinnerGFlops:
+              newWinnerIdx = j
+              newWinnerGFlops = gflops
+          # update candidate and prior scores
+          sizeFree = diagonalData[newDiagonalDataIdx][0]
+          totalFlops = sizeFree*totalFlopsPerSizeFree
+          candidateGFlops = \
+              diagonalData[newDiagonalDataIdx][1][candidateWinnerIdx]
+          priorGFlops = diagonalData[newDiagonalDataIdx][1][priorWinnerIdx]
+          candidateTimeUs = totalFlops / candidateGFlops / 1000
+          priorTimeUs = totalFlops / priorGFlops / 1000
+          candidateScore += candidateTimeUs
+          priorScore += priorTimeUs
+          print "  NDDI[%3u] Prior[%2u]: %.0fus vs Candi[%2u]: %.0fus" \
+              % (newDiagonalDataIdx, priorWinnerIdx, priorScore, \
+              candidateWinnerIdx, candidateScore)
+          if newWinnerIdx == candidateWinnerIdx:
+            print "    newWinnerIdx == candidateWinnerIdx"
+            if candidateScore < priorScore:
+              # candidate rule accepted
+              rules.append(candidateRule)
+              print "      accepting"
+              break
+            else:
+              # candidate rule not yet accepted
+              candidateRule[1] = sizeFree
+              candidateRule[2] = candidateGFlops
+              print "      continuing"
+              continue
+          elif newWinnerIdx == priorWinnerIdx:
+            print "    newWinnerIdx == priorWinnerIdx"
+            # returned to original winner, decide now to accept/reject
+            if candidateScore < priorScore:
+              # candidate rule accepted
+              rules.append(candidateRule)
+              print "      accepting"
+              break
+            else:
+              # candidate rule rejected; update prior, continue at newSize
+              rules[len(rules)-1][1] = sizeFree
+              rules[len(rules)-1][2] = priorGFlops
+              diagonalDataIdx = newDiagonalDataIdx
+              print "      rejecting"
+              break
+          else:
+            print "    newWinnerIdx is %u" % newWinnerIdx
+            # new winner was a 3rd solution; decide now (same as above)
+            if candidateScore < priorScore:
+              # candidate rule accepted
+              rules.append(candidateRule)
+              print "      accepting"
+              break
+            else:
+              # candidate rule rejected; update prior, continue at newSize
+              rules[len(rules)-1][1] = diagonalData[newDiagonalDataIdx][0]
+              rules[len(rules)-1][2] = \
+                  diagonalData[newDiagonalDataIdx][1][priorWinnerIdx]
+              diagonalDataIdx = newDiagonalDataIdx
+              print "      rejecting"
+              break
+
+      return
+
+        # go farther forward, does candidate rule keep winning, or does priorRule keep winning?
+        # the new rule should start at a loss b/c of Weight2
+        # a few problems in the future
+            # if new rule is better, W2 gets amortized, Wt improves
+            # if new rule is worse, W2 gets amortized, Wt worsens
+        # continue to future problems until one of these occurs, then make the final decision
+          # newRule gets better score; accept
+          # return to priorRule winner; accept/reject
+          # Yet a new winner
+            # easy: make final accept/reject including this new problem size
+            # hard: recurse?
+          #
+        # does the number of future problems to examine vary with W2, Wt?
+# Wt = 1
+# W2 = 1 means we would rather lose 1us per kernel rather than adding another split (actually they're equal)
+# so, in order for candidate to be accepted immediately, it must improve all kernels by more than 1us, or after 2 sizes, improve by 0.5us per kernel
+#
+#
+# 0 0 1 0 0
+# 0 0 1 1 0
+# 0 0 1 4 0
+# 0 0 1 4 1 0
+#
+
+      print "Winner[%3u]: %u" % (i, winnerIdx)
+
+
+    return
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
     # abstract to multidimensions
     # what is the diagonal
-    dilation = self.analysisParameters["Dilation"]
-    threshold = self.analysisParameters["Threshold"]
+    dilation = self.self.parameters["Dilation"]
+    threshold = self.self.parameters["Threshold"]
     numProblems0 = self.numProblemSizes[self.idx0]
 
     ############################################################################
@@ -429,8 +831,8 @@ def getSkinnySolutions(self, diagonalRules, problemIndices, \
     idx0 = self.idx0
     idx1 = self.idx1
     #idxU = self.idxU
-    #dilation = self.analysisParameters["Dilation"]
-    threshold = self.analysisParameters["Threshold"]
+    #dilation = self.self.parameters["Dilation"]
+    threshold = self.self.parameters["Threshold"]
 
     skinnyRules = []
 
@@ -500,42 +902,374 @@ def getSkinnySolutions(self, diagonalRules, problemIndices, \
     return skinnyRules
     # end skinny solutions
 
+
   ##############################################################################
-  # Get Size Free and Summation
+  # Determine Logic Along U
   ##############################################################################
-  def getWinnerForProblem(self, problemIndices):
-    problemIdx = self.indicesToSerial(0, problemIndices)
-    winnerIdx = -1
-    winnerGFlops = -1
-    for solutionIdx in range(0, self.numSolutions):
-      solutionSerialIdx = problemIdx + solutionIdx
-      solutionGFlops = self.data[solutionSerialIdx]
-      if solutionGFlops > winnerGFlops:
-        #print "%f > %f" % (solutionGFlops, winnerGFlops)
-        winnerIdx = solutionIdx
-        winnerGFlops = solutionGFlops
-    return (winnerIdx, winnerGFlops)
+  def determineLogicAlongU(self):
+    globalRange = []
+    for i in range(0, self.numIndices):
+      globalRange.append( [0, self.numProblemSizes[i]] )
+    # NOTE(review): globalRange is built above but not read again in this method.
+    # Overall flow: dump the 2D winner tables, compute diagonal and skinny rules,
+    # collect the solutions the diagonal rules use (sorted by macro-tile area,
+    # then DepthU), and renumber the diagonal rules against that reduced list.
+    self.print2D([0, 0])
+
+    ############################################################################
+    # Determine Solutions Along Diagonal
+    # roughly same splitting regardless of sizeU
+    problemIndices = []
+    for numProblemsForIndex in self.numProblemSizes:
+      problemIndices.append(numProblemsForIndex-1) # last (largest-index) problem
+    print problemIndices
+    self.diagonalRules = self.getFastestSolutionsAlongDiagonal(problemIndices) # NOTE(review): confirm this method is still defined on the class
+    if True:
+      print2("Diagonal Rules:")
+      for rule in self.diagonalRules:
+        string = "  if freeSize >=%4u" % self.problemIndexToSize[0][rule[1][0]]
+        for i in range(1, self.numIndices):
+          string += "x%4u" % self.problemIndexToSize[i][rule[1][i]]
+        string += " return S[%u] @ %5.0f-%5.0f>%5.0f GFlops is %s" \
+            % (rule[0], rule[2], rule[3], rule[4], \
+            self.solutionNames[rule[0]])
+        print2(string)
+
+    ############################################################################
+    # Determine Skinny0 Solutions
+    skinnyRules01 = self.getSkinnySolutions(self.diagonalRules, problemIndices, \
+        self.idx0, self.idx1)
+    #print "Skinny Rules:"
+    #for rule in skinnyRules01:
+    #  string = "  if freeSize >=%4u" % data.problemIndexToSize[0][rule[1][0]]
+    #  for i in range(1, data.numIndices):
+    #    string += "x%4u" % data.problemIndexToSize[i][rule[1][i]]
+    #  string += " return S[%u] @ %5.0f-%5.0f>%5.0f GFlops is %s" \
+    #      % (rule[0], rule[2], rule[3], rule[4], \
+    #      data.solutionNames[rule[0]])
+
+    ############################################################################
+    # Determine Skinny1 Solutions
+    skinnyRules10 = self.getSkinnySolutions(self.diagonalRules, problemIndices, \
+        self.idx1, self.idx0)
+
+    # list solutions that actually get used
+    solutionIndicesUsed = []
+    for rule in skinnyRules01:
+      pass # skinny rules are not yet folded into the used-solution list
+    for rule in skinnyRules10:
+      pass # skinny rules are not yet folded into the used-solution list
+    for rule in self.diagonalRules:
+      solutionIdx = rule[0]
+      solution = self.solutions[solutionIdx]
+      MT0 = solution["MacroTile0"]
+      MT1 = solution["MacroTile1"]
+      DU = solution["DepthU"]
+      #print "Rule Tile S[%u]: %ux%ux%u" % (solutionIdx, MT0, MT1, DU)
+      # is this solution in the list
+      inList = False
+      for solutionUsed in solutionIndicesUsed:
+        if solutionUsed[0] == solutionIdx:
+          inList = True
+          break
+      if not inList:
+        insertIdx = len(solutionIndicesUsed) # default: append at the end
+        for i in range(0, len(solutionIndicesUsed)):
+          iMT0 = solutionIndicesUsed[i][1]
+          iMT1 = solutionIndicesUsed[i][2]
+          iDU  = solutionIndicesUsed[i][3]
+          #print "  compare S[%u]: %ux%ux%u" % (solutionIndicesUsed[i][0], \
+          #    iMT0, iMT1, iDU)
+          if MT0*MT1 < iMT0*iMT1:
+            insertIdx = i
+            break
+          elif MT0*MT1 > iMT0*iMT1:
+            continue
+          else: # MT == MT
+            if DU < iDU:
+              insertIdx = i
+              break
+            else:
+              continue
+
+          # if i'm smaller than i, insert me before i
+        #print "insert: %u" % insertIdx
+        solutionIndicesUsed.insert(insertIdx, [solutionIdx, MT0, MT1, DU])
+    #print solutionIndicesUsed
+
+    # list of solutions used
+    self.solutionsUsed = []
+    for solutionIndexUsed in solutionIndicesUsed:
+      self.solutionsUsed.append(self.solutions[solutionIndexUsed[0]])
+
+    # translate rules to new solution indices
+    for rule in skinnyRules01:
+      pass # skinny rules are not translated yet
+    for rule in skinnyRules10:
+      pass # skinny rules are not translated yet
+    for ruleIdx in range(0, len(self.diagonalRules)):
+      solutionIdx = self.diagonalRules[ruleIdx][0]
+      for i in range(0, len(solutionIndicesUsed)):
+        solutionIndexUsed = solutionIndicesUsed[i]
+        if solutionIdx == solutionIndexUsed[0]:
+          self.diagonalRules[ruleIdx][0] = i
+          break
+      # change problemSizeIndices to sizes
+      for i in range(0, 3): # NOTE(review): hard-coded 3; only the first 3 indices are converted
+        self.diagonalRules[ruleIdx][1][i] = \
+            self.problemIndexToSize[i][ self.diagonalRules[ruleIdx][1][i] ]
+
+    print2("# New Rules: %s" % self.diagonalRules)
+
 
 
   ##############################################################################
-  # Get Size Free and Summation
   ##############################################################################
-  def getSizeFree(self, problemIndices):
-    sizeFree = 1
-    for i in range(0, self.problemType["NumIndicesC"]):
-      sizeFree *= self.problemIndexToSize[i][problemIndices[i]]
-    return sizeFree
+  ###
+  ###  Helper / Low-Level Functions
+  ###
+  ##############################################################################
+  ##############################################################################
+
 
-  def getSizeSummation(self, problemIndices):
-    sizeSummation = 1
-    for i in range(self.problemType["NumIndicesC"], \
-        self.problemType["TotalIndices"]):
-      sizeSummation *= self.problemIndexToSize[i][problemIndices[i]]
-    return sizeSummation
 
   ##############################################################################
-  # Print Data
+  # Print2D
+  def print2D(self, indices ):
+    # Write Winner2D_<indices>.csv into the working path: tables over idx0 x idx1
+    # of winner, its gflops, winner/second ratio, second place, and each solution.
+    indicesIdx = 0
+    problemIndices = []
+    for i in range(0, self.numIndices):
+      if i == self.idx0 or i == self.idx1:
+        problemIndices.append(-1)
+      else:
+        problemIndices.append(indices[indicesIdx])
+        indicesIdx += 1
+
+    winnerIndices = []
+    w = "winner"
+    g = "gflops"
+    f = "faster"
+    s = "second"
+    sss = []
+    for sIdx in range(0, self.numSolutions):
+      sss.append("Sol[%u]" % sIdx)
+    for j in range(0, self.numProblemSizes[self.idx1]):
+      w += ",%4u" % self.problemIndexToSize[self.idx1][j]
+      g += ",%4u" % self.problemIndexToSize[self.idx1][j]
+      f += ",%4u" % self.problemIndexToSize[self.idx1][j]
+      s += ",%4u" % self.problemIndexToSize[self.idx1][j]
+      for sIdx in range(0, self.numSolutions):
+        sss[sIdx] += ",%4u" % self.problemIndexToSize[self.idx1][j]
+    w += "\n"
+    g += "\n"
+    f += "\n"
+    s += "\n"
+    for sIdx in range(0, self.numSolutions):
+      sss[sIdx] += "\n"
+    for i in range(0, self.numProblemSizes[self.idx0]):
+      problemIndices[self.idx0] = i
+      w += "%4u" % self.problemIndexToSize[self.idx0][i]
+      g += "%4u" % self.problemIndexToSize[self.idx0][i]
+      f += "%4u" % self.problemIndexToSize[self.idx0][i]
+      s += "%4u" % self.problemIndexToSize[self.idx0][i]
+      for sIdx in range(0, self.numSolutions):
+        sss[sIdx] += "%4u" % self.problemIndexToSize[self.idx0][i]
+      for j in range(0, self.numProblemSizes[self.idx1]):
+        problemIndices[self.idx1] = j
+        problemIdx = self.indicesToSerial(0, problemIndices)
+        for sIdx in range(0, self.numSolutions):
+          sss[sIdx] += ",%f" % self.data[problemIdx+sIdx]
+        # seed winner/second from solutions 0 and 1 (needs >= 2 solutions)
+        if self.data[problemIdx+0] > self.data[problemIdx+1]:
+          winnerIdx = 0
+          winnerGFlops = self.data[problemIdx+0]
+          secondIdx = 1
+          secondGFlops = self.data[problemIdx+1]
+        else:
+          winnerIdx = 1
+          winnerGFlops = self.data[problemIdx+1]
+          secondIdx = 0
+          secondGFlops = self.data[problemIdx+0]
+        for solutionIdx in range(2, self.numSolutions):
+          solutionSerialIdx = problemIdx + solutionIdx
+          solutionGFlops = self.data[solutionSerialIdx]
+          if solutionGFlops > winnerGFlops:
+            #print "%f > %f" % (solutionGFlops, winnerGFlops)
+            secondIdx = winnerIdx
+            secondGFlops = winnerGFlops
+            winnerIdx = solutionIdx
+            winnerGFlops = solutionGFlops
+          elif solutionGFlops > secondGFlops:
+            secondIdx, secondGFlops = solutionIdx, solutionGFlops
+        if winnerIdx not in winnerIndices:
+          winnerIndices.append(winnerIdx)
+        w += ",%4u" % winnerIdx
+        g += ",%f" % winnerGFlops
+        f += ",%f" % (winnerGFlops/secondGFlops)
+        s += ",%4u" % (secondIdx)
+      w += "\n"
+      g += "\n"
+      f += "\n"
+      s += "\n"
+      for sIdx in range(0, self.numSolutions):
+        sss[sIdx] += "\n"
+
+    w += "\n\n"
+    g += "\n\n"
+    f += "\n\n"
+    s += "\n\n"
+    for sIdx in range(0, self.numSolutions):
+      sss[sIdx] += "\n\n"
+    w += "Winners:\n"
+    for winnerIdx in winnerIndices:
+      w += "%4u, %s, %s\n" % (winnerIdx, self.solutionTiles[winnerIdx], self.solutionNames[winnerIdx])
+
+    printFileName = "Winner2D"
+    for idx in indices:
+      printFileName += "_%u" % idx
+    printFileName += ".csv"
+    # "with" closes the file even if one of the writes raises
+    with open(os.path.join(globalParameters["WorkingPath"], printFileName), "w") as printFile:
+      printFile.write( w )
+      printFile.write( g )
+      printFile.write( f )
+      printFile.write( s )
+      for sIdx in range(0, self.numSolutions):
+        printFile.write( sss[sIdx] )
+
+
+  ##############################################################################
+  # Least Important Solution
+  def leastImportantSolution(self):
+    # Find the solution that saves the least time over the runner-up on the
+    # problems it wins. Returns (solutionIdx, fraction of total saved time,
+    # fraction of wins, fraction of total winner time). Needs >= 2 solutions.
+    solutionImportance = []
+    for i in range(0, self.numSolutions):
+      solutionImportance.append([i, 0, 0, 0]) # [idx, savedMs, wins, execMs]
+    totalSavedMs = 0
+    totalExecMs = 0
+    totalWins = 0
+    for problemIndices in self.problemIndicesForGlobalRange:
+      totalFlops = self.totalFlopsForProblemIndices(problemIndices)
+      problemIdx = self.indicesToSerial(0, problemIndices)
+      if self.data[problemIdx+0] > self.data[problemIdx+1]:
+        winnerIdx = 0
+        winnerGFlops = self.data[problemIdx+0]
+        secondIdx = 1
+        secondGFlops = self.data[problemIdx+1]
+      else:
+        winnerIdx = 1
+        winnerGFlops = self.data[problemIdx+1]
+        secondIdx = 0
+        secondGFlops = self.data[problemIdx+0]
+
+      for solutionIdx in range(2, self.numSolutions):
+        solutionSerialIdx = problemIdx + solutionIdx
+        solutionGFlops = self.data[solutionSerialIdx]
+        if solutionGFlops > winnerGFlops:
+          secondIdx = winnerIdx
+          secondGFlops = winnerGFlops
+          winnerIdx = solutionIdx
+          winnerGFlops = solutionGFlops
+        elif solutionGFlops > secondGFlops:
+          secondIdx = solutionIdx
+          secondGFlops = solutionGFlops
+      winnerTimeMs = totalFlops / winnerGFlops / 1000000
+      secondTimeMs = totalFlops / secondGFlops / 1000000
+      solutionImportance[winnerIdx][1] += (secondTimeMs - winnerTimeMs)
+      solutionImportance[winnerIdx][2] += 1
+      solutionImportance[winnerIdx][3] += winnerTimeMs
+
+      totalSavedMs += secondTimeMs - winnerTimeMs
+      totalExecMs += winnerTimeMs
+      totalWins += 1
+    solutionImportance.sort(key=lambda x: x[1])
+    return ( solutionImportance[0][0],
+        solutionImportance[0][1] / totalSavedMs,
+        solutionImportance[0][2] / float(totalWins), # int/int would truncate
+        solutionImportance[0][3] / totalExecMs )
+
+
+  ##############################################################################
+  # Score Range For Logic
+  def scoreRangeForLogic(self, indexRange, logic):
+    pass # stub: no scoring implemented; both arguments are ignored
+
+  ##############################################################################
+  # Score (microseconds) Range For Solutions
+  def scoreRangeForSolutions(self, indexRange):
+    scores = [0]*self.numSolutions # one accumulated time per solution
+    for problemIndices in self.problemIndicesForRange(indexRange):
+      problemSerial = self.indicesToSerial(0, problemIndices)
+      totalFlops = self.totalFlopsForProblemIndices(problemIndices)
+      for solutionIdx in range(0, self.numSolutions):
+        gflops = self.data[problemSerial+solutionIdx]
+        timeUs = totalFlops / gflops / 1000 # raises ZeroDivisionError if gflops is 0
+        scores[solutionIdx] += timeUs
+    return scores # summed time per solution over the range; lower is faster
+
+  ##############################################################################
+  # Total Flops For Problem Indices
+  def totalFlopsForProblemIndices(self, problemIndices):
+    totalFlops = self.flopsPerMac # start from flops per multiply-accumulate
+    for i in range(0, self.numIndices):
+      totalFlops *= self.problemIndexToSize[i][problemIndices[i]] # size of index i
+    return totalFlops # flopsPerMac times the product of all index sizes
+
   ##############################################################################
+  # Remove Solution
+  def removeSolution(self, removeSolutionIdx):
+    # Drop one solution and rebuild names, tiles, counts and the data array.
+    # temporarily move current to old
+    oldSolutions = self.solutions
+    oldNumSolutions = self.numSolutions
+    oldData = self.data
+    oldTotalSize = self.totalSize # NOTE(review): saved but not read afterwards
+
+    # update solutions
+    self.solutions = []
+    for i in range(0, oldNumSolutions):
+      if i != removeSolutionIdx:
+        self.solutions.append(oldSolutions[i])
+    self.solutionMinNaming = Solution.getMinNaming(self.solutions)
+    self.solutionNames = []
+    self.solutionTiles = []
+    for solution in self.solutions:
+      self.solutionNames.append(Solution.getNameMin(solution, \
+          self.solutionMinNaming))
+      self.solutionTiles.append("%ux%u"%(solution["MacroTile0"], \
+          solution["MacroTile1"]))
+    self.numSolutions = len(self.solutions)
+
+    # update data: copy each problem's row, skipping the removed column
+    self.totalSize = self.totalProblems * self.numSolutions
+    self.data = array.array('f', [0]*self.totalSize)
+    for problemIdx in range(0, self.totalProblems):
+      newSolutionIdx = 0
+      for oldSolutionIdx in range(0, oldNumSolutions):
+        if oldSolutionIdx != removeSolutionIdx:
+          self.data[problemIdx*self.numSolutions+newSolutionIdx] \
+              = oldData[problemIdx*oldNumSolutions+oldSolutionIdx]
+          newSolutionIdx += 1
+
+  ##############################################################################
+  # Recommended Index Order
+  # TODO, this may depend on transposes
+  def recommendedIndexOrder(self):
+    order = [] # result: all other indices first, then idxU, idx0, idx1
+    for i in range(0, self.numIndices):
+      if i != self.idxU and i != self.idx1 and i != self.idx0:
+        order.append(i)
+    order.append(self.idxU)
+    order.append(self.idx0)
+    order.append(self.idx1)
+    return order
+
+  ##############################################################################
+  # Print Data
   def printData(self):
     print2("serial; idxD0, idxD1, idxDU, idxOthers; sizeD0, sizeD1, sizeDU, sizeOthers; sol0, sol1, sol2, ...")
     indices = [0]*self.numIndices
@@ -557,26 +1291,83 @@ def printData(self):
           indices[i] += 1
 
   ##############################################################################
-  # Get Item
+  # Problem Indices For Range: list every index tuple inside indexRange
+  def problemIndicesForRange(self, indexRange):
+    problemIndexList = []
+    problemIndices = []
+    for idx in indexRange:
+      problemIndices.append(idx[0])
+    moreProblems = True
+    while moreProblems:
+      problemIndexList.append(deepcopy(problemIndices))
+      # next problem: bump index 0; on reaching indexRange[i][1] reset, carry
+      problemIndices[0] += 1
+      for i in range(0, self.numIndices):
+        if problemIndices[i] >= indexRange[i][1]:
+          if i == self.numIndices-1:
+            moreProblems = False
+            break
+          else:
+            problemIndices[i] = indexRange[i][0]
+            problemIndices[i+1] += 1
+        else:
+          break
+    return problemIndexList
+
   ##############################################################################
+  # Get Winner For Problem: (winnerIdx, winnerGFlops) of the largest data value
+  def getWinnerForProblem(self, problemIndices):
+    problemIdx = self.indicesToSerial(0, problemIndices)
+    winnerIdx = -1
+    winnerGFlops = -1
+    for solutionIdx in range(0, self.numSolutions):
+      solutionSerialIdx = problemIdx + solutionIdx
+      solutionGFlops = self.data[solutionSerialIdx]
+      if solutionGFlops > winnerGFlops:
+        # strict '>' means a tie keeps the lower solution index
+        winnerIdx = solutionIdx
+        winnerGFlops = solutionGFlops
+    return (winnerIdx, winnerGFlops)
+
+
+  ##############################################################################
+  # Get Size Free: product of this problem's sizes over rangeIndicesFree
+  def getSizeFree(self, problemIndices):
+    sizeFree = 1
+    for i in self.rangeIndicesFree:
+      sizeFree *= self.problemIndexToSize[i][problemIndices[i]]
+    return sizeFree
+
+
+  ##############################################################################
+  # Get Size Summation: product of this problem's sizes over rangeIndicesSummation
+  def getSizeSummation(self, problemIndices):
+    sizeSummation = 1
+    for i in self.rangeIndicesSummation:
+      sizeSummation *= self.problemIndexToSize[i][problemIndices[i]]
+    return sizeSummation
+
+
+  ##############################################################################
+  # Get Item
   def __getitem__(self, indexTuple):
     indices = indexTuple[0] # in analysis order
     solutionIdx = indexTuple[1]
     serial = self.indicesToSerial(solutionIdx, indices)
     return self.data[serial]
 
+
   ##############################################################################
-  # Get Item
-  ##############################################################################
+  # Set Item
   def __setitem__(self, indexTuple, value):
     indices = indexTuple[0] # in analysis order
     solutionIdx = indexTuple[1]
     serial = self.indicesToSerial(solutionIdx, indices )
     self.data[serial] = value
 
+
   ##############################################################################
   # Indices -> Serial
-  ##############################################################################
   def indicesToSerial(self, solutionIdx, indices ):
     serial = 0
     stride = 1
@@ -588,8 +1379,13 @@ def indicesToSerial(self, solutionIdx, indices ):
     return serial
 
 
+
+################################################################################
+################################################################################
+###
+###   Main
+###
 ################################################################################
-# Main
 ################################################################################
 def main(  config ):
   print2("# LibraryLogic config: %s" % config)
diff --git a/Tensile/SolutionStructs.py b/Tensile/SolutionStructs.py
index 7dc27a644..795c32022 100644
--- a/Tensile/SolutionStructs.py
+++ b/Tensile/SolutionStructs.py
@@ -123,6 +123,8 @@ def numRegisters( self ):
     return self.properties[self.value][self.idxReg]
   def numBytes( self ):
     return self.numRegisters() * 4
+  def flopsPerMac(self):
+    return 1 if self.isReal() else 8 # NOTE(review): real MAC is usually counted as 2 flops - confirm 1 is intended
 
   def __str__(self):
     return self.toChar()

From 35ba76651df0896e44502b5c53fd908e0399515e Mon Sep 17 00:00:00 2001
From: David Tanner <guacamoleo@gmail.com>
Date: Wed, 1 Mar 2017 15:59:02 -0600
Subject: [PATCH 13/21] recursive logic runs to completion

---
 Tensile/Common.py       |   3 +
 Tensile/LibraryLogic.py | 921 +++++++++-------------------------------
 2 files changed, 206 insertions(+), 718 deletions(-)

diff --git a/Tensile/Common.py b/Tensile/Common.py
index 557690f79..9b04526c0 100644
--- a/Tensile/Common.py
+++ b/Tensile/Common.py
@@ -152,6 +152,9 @@
 # Default Analysis Parameters
 ################################################################################
 defaultAnalysisParameters = {
+    "InitialSolutionWindow":     4,
+    "BranchWeight":              100, # microseconds / kernel
+
     "Dilation":                 3,
     "Threshold":                0.1,
     "OutlierThreshold":         0,
diff --git a/Tensile/LibraryLogic.py b/Tensile/LibraryLogic.py
index 72b8d60cb..9dfeba30b 100644
--- a/Tensile/LibraryLogic.py
+++ b/Tensile/LibraryLogic.py
@@ -240,9 +240,9 @@ def removeInvalidSolutions(self):
       moreProblems = True
       invalidIdx = -1
       for problemIndices in self.problemIndicesForGlobalRange:
-        problemIdx = self.indicesToSerial(0, problemIndices)
+        problemSerial = self.indicesToSerial(0, problemIndices)
         for solutionIdx in range(0, self.numSolutions):
-          gflops = self.data[problemIdx+solutionIdx]
+          gflops = self.data[problemSerial+solutionIdx]
           if gflops == 0:
             invalidIdx = solutionIdx
             break
@@ -276,10 +276,10 @@ def smooth(self):
     outlierThreshold = self.parameters["OutlierThreshold"]
     problemSizes = [0]*self.numIndices
     for problemIndices in self.problemIndicesForGlobalRange:
-      problemIdx = self.indicesToSerial(0, problemIndices)
+      problemSerial = self.indicesToSerial(0, problemIndices)
 
       for solutionIdx in range(0, self.numSolutions):
-        gflops = self.data[problemIdx+solutionIdx]
+        gflops = self.data[problemSerial+solutionIdx]
         neighborGFlops = []
         smoothProblem = False
         for iIdx in range(0, self.numIndices):
@@ -304,132 +304,107 @@ def smooth(self):
             problemSizes[i] = self.problemIndexToSize[i][problemIndices[i]]
             s += "%u, " % problemSizes[i]
           new = sum(neighborGFlops)/len(neighborGFlops)
-          old = self.data[problemIdx+solutionIdx]
+          old = self.data[problemSerial+solutionIdx]
           s += "%f -> %f" % (old, new)
           print s
-          self.data[problemIdx+solutionIdx] \
+          self.data[problemSerial+solutionIdx] \
               = sum(neighborGFlops)/len(neighborGFlops)
 
 
   ##############################################################################
   # ENTRY: En Rule
   # currentIndexIndex = 0, 1, 2, 3...
+  # currentIndexRange will have only 1 size for prior indices (unless initial)
+  #
+  # Rule:
+  # [128, [
+  #         [64, [
+  #                [16, 0],
+  #                [2880,1]
+  #              ]
+  #         ],
+  #         [96, [
+  #                [16, 0],
+  #                [64, 1]
+  #              ]
+  #         ]
+  #       ]
+  # ], another
+  #
+  #
   ##############################################################################
   def enRule(self, currentIndexIndex, currentIndexRange):
+    tab = ""
+    for i in range(0, currentIndexIndex):
+      tab += "  "
+    print "%senRule(%u, %s)" % (tab, currentIndexIndex, currentIndexRange)
     currentIndex = self.indexOrder[currentIndexIndex]
-    lastIndex = currentIndexIndex == self.numIndices-1
+    nextIndexIndex = currentIndexIndex+1
+    nextIndexRange = deepcopy(currentIndexRange)
+    isLastIndex = currentIndexIndex == self.numIndices-1
 
     # if there's only 1 problem size here
     if currentIndexRange[currentIndex][1] \
         - currentIndexRange[currentIndex][0] == 1:
+
       # this is last index, so just return fastest solution
-      if lastIndex:
-        scores = scoreRangeForSolutions(currentIndexRange)
-        winnerIdx = 0
-        for solutionIdx in range(1, self.numSolution):
-          if scores[solutionIdx] < scores[winnerIdx]:
-            winnerIdx = solutionIdx
-        rule = [ -1, winnerIdx ]
+      if isLastIndex:
+        # optimize b/c this should be only single problem
+        #scores = self.scoreRangeForSolutions(currentIndexRange)
+        #winnerIdx = 0
+        #for solutionIdx in range(1, self.numSolution):
+        #  if scores[solutionIdx] < scores[winnerIdx]:
+        #    winnerIdx = solutionIdx
+        winnerIdx = self.winnerForRange(currentIndexRange)
+        print "%s  returning early winner=%u" % (tab, winnerIdx)
+        return [ -1, winnerIdx ]
+
       # this isn't last index, so just return next index
       else:
-        newIndexIndex = currentIndexIndex+1
-        newIndexRange = deepcopy(currentIndexRange)
-        rule = [ -1, self.enRule(newIndexIndex, newIndexRange) ]
-
-    # create rule for smallest size
-
-    # for all problem indices in this index
-    for problemIndex in range(currentIndexRange[currentIndex][0], \
-        currentIndexRange[currentIndex][1]):
-    # rules = seed with smallest rule
-    # for dimIdx = 0 -> numSizes
-      # if newRule
-        # score range using newRule
-        # score range using priorRule
-        # accept/reject based on score
-    # current index is dimOrder[0]
-
-
-
+        print "%s  returning early enRule(%u,%s)" \
+            % (tab, nextIndexIndex, nextIndexRange)
+        return [ -1, self.enRule(nextIndexIndex, nextIndexRange) ]
 
+    # ruleList
+    ruleList = []
 
-    sumValues = []
-    totalSummationSizes = 1
-    for i in self.rangeIndicesSummation:
-      totalSummationSizes *= self.numProblemSizes[i]
-    summationPermutations = []
-    for permutationIdx in range(0, totalSummationSizes):
-      permutation = []
-      permutationSize = 1
-      pIdx = permutationIdx
-      for i in self.rangeIndicesSummation:
-        idx = pIdx % self.numProblemSizes[i]
-        permutation.append(idx)
-        permutationSize *= self.problemIndexToSize[i][idx]
-        pIdx /= self.numProblemSizes[i]
-      # insert permutation in sorted order
-      insertIdx = len(summationPermutations)-1
-      for pIdx in range(0, len(summationPermutations)):
-        size = 1
-        for i in self.rangeIndicesSummation:
-          size *= self.problemIndexToSize[i][summationPermutations[pIdx][i]]
-        if permutationSize > size:
-          insertIdx = pIdx
-          break
-      summationPermutations.insert(insertIdx, permutation)
-    print "SummationPermutations:", summationPermutations
-
+    # create rule for smallest size
+    initialSize = min(currentIndexRange[currentIndex][0] \
+        + self.parameters["InitialSolutionWindow"], \
+        self.numProblemSizes[currentIndex])
+    nextIndexRange[currentIndex][1] = initialSize
+    if isLastIndex:
+      winnerIdx = self.winnerForRange(nextIndexRange)
+      initialRule = [ currentIndexRange[currentIndex][0], winnerIdx]
 
-    if len(summationPermutations) == 1:
-      rules = [ 0, self.createRules01(summationPermutations[0]) ]
-      return rules
     else:
-      printExit("No Logic to support multiple summation sizes.")
-      # iterate over summation permutations
-# for each serial pair, scoreA, scoreB, scoreAB
-# keep rule AB if scoreAB isn't much slower than scoreA + scoreB
+      initialRule = [ currentIndexRange[currentIndex][0], \
+          self.enRule(nextIndexIndex, nextIndexRange) ]
+    ruleList.append(initialRule)
 
-    """
-    sizeSummation *= self.problemIndexToSize[i][problemIndices[i]]
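+    # for all problem indices in this index, test a candidate rule against the prior one (NOTE(review): the isLastIndex branch below never uses problemIndex - confirm intended)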
 
-    firstProblemIndices = []
-    lastProblemIndices = []
-    for i in range(0, self.numIndices):
-      firstProblemIndices.append(0)
-      lastProblemIndices.append(self.numProblems[i]-1)
-    minSumValue = self.getSizeSummation(firstProblemIndices)
-    maxSumValue = self.getSizeSummation(lastProblemIndices)
-    numSumValues =
-
-
-    rule = [
-        [
-          minU,                             # k threshold
-          [[min01,s], [0,s]],               # diagonals
-          [0, max0, [[min1,s], [min1,s]]],  # skinny0's
-          [1, max1, [[min0,s], [min0,s]]],  # skinny1's
-        ],
-        [
-          minU,                             # k threshold
-          [[min01,s], [0,s]],               # diagonals
-          [0, max0, [[min1,s], [min1,s]]],  # skinny0's
-          [1, max1, [[min0,s], [min0,s]]],  # skinny1's
-        ],
-    ]
-
-    ruleA = createRules01()
-    ruleB = createRules01()
-
-    minSumValue = 0
-    maxSumValue = self.numProblems
+    for problemIndex in range(currentIndexRange[currentIndex][0], \
+        currentIndexRange[currentIndex][1]):
+      print "%s  pIdx: %u" % (tab, problemIndex)
+      nextIndexRange[currentIndex][0] = problemIndex
+      nextIndexRange[currentIndex][1] = problemIndex+1
 
+      if isLastIndex:
+        winnerIdx = self.winnerForRange(currentIndexRange)
+        candidateRule = [ currentIndexRange[currentIndex][0], winnerIdx]
+      else:
+        candidateRule = [ problemIndex, self.enRule(nextIndexIndex, \
+            nextIndexRange) ]
+      priorRule = ruleList[len(ruleList)-1]
+      priorRuleScore = self.scoreRangeForLogic(nextIndexRange, priorRule)
+      candidateRuleScore = self.scoreRangeForLogic(nextIndexRange, \
+          candidateRule)
+      candidateRuleScore += self.parameters["BranchWeight"] # penalize
+      if candidateRuleScore < priorRuleScore:
+        ruleList.append(candidateRule)
 
-    sizeSummation = 1
-    for i in range(self.problemType["NumIndicesC"], \
-        self.problemType["TotalIndices"]):
-      sizeSummation *= self.problemIndexToSize[i][problemIndices[i]]
-    return sizeSummation
-    """
+    return ruleList
 
 
 
@@ -443,585 +418,6 @@ def enRule(self, currentIndexIndex, currentIndexRange):
 
 
 
-  ##############################################################################
-  # Create Rules dim0 / dim1
-  ##############################################################################
-  def createRules01(self, problemSizeSummation ):
-
-    diagonalRules = self.createRulesDiagonal(problemSizeSummation)
-
-
-  ##############################################################################
-  # Create Rules Diagonal
-  ##############################################################################
-  def createRulesDiagonal(self, problemSizeSummation):
-    thresholdForDiagonality = 1.5 # slightly fewer problems than 2
-    numProblemSizesFastestDiagonal = 16
-    problemIndices = [0]*self.numIndices
-    for i in self.rangeIndicesSummation:
-      problemIndices[i] = problemSizeSummation[i \
-          - self.problemType["NumIndicesC"]]
-    print2("\nDiagonalRules for %s" % problemIndices)
-    problemSizes = [0]*self.numIndices
-    totalFlopsPerSizeFree = self.flopsPerMac
-    for i in self.rangeIndicesSummation:
-      totalFlopsPerSizeFree *= self.problemIndexToSize[i][problemIndices[i]]
-    print "totalFlopsPerSizeFree", totalFlopsPerSizeFree
-
-    ########################################
-    # transform data into serial list of "diagonal problem sizes"
-    diagonalData = []
-    moreProblems = True
-    while moreProblems:
-
-      # size free
-      for i in range(0, self.numIndices):
-        problemSizes[i] = self.problemIndexToSize[i][problemIndices[i]]
-      size0 = problemSizes[self.idx0]
-      size1 = problemSizes[self.idx1]
-
-      # if diagonal
-      if size0 < size1*thresholdForDiagonality \
-          and size1 < size0*thresholdForDiagonality:
-        sizeFree = self.getSizeFree(problemIndices)
-
-        problemIdx = self.indicesToSerial(0, problemIndices)
-        solutionGFlops = []
-        for i in range(0, self.numSolutions):
-          solutionGFlops.append(self.data[problemIdx+i])
-
-        diagonalData.append([ sizeFree, solutionGFlops ])
-
-      # next problem
-      problemIndices[0] += 1
-      for i in self.rangeIndicesFree:
-        if problemIndices[i] >= self.numProblemSizes[i]:
-          if i == self.problemType["NumIndicesFree"]-1:
-            moreProblems = False
-            break
-          else:
-            problemIndices[i] = 0
-            problemIndices[i+1] += 1
-        else:
-          break
-
-    diagonalData.sort(key=lambda x: x[0], reverse=True)
-    for dd in diagonalData:
-      print "DD[%u]: %s" % (dd[0], dd[1])
-    print len(diagonalData)
-
-
-    ########################################
-    # create first rule
-    sizeFree = diagonalData[0][0]
-    relativeTime = [0]*self.numSolutions
-    for i in range(0, numProblemSizesFastestDiagonal):
-      for j in range(0, self.numSolutions):
-        gflops = diagonalData[i][1][j]
-        relativeTime[j] += 1 / gflops
-    winnerIdx = 0
-    winnerRelativeTime = relativeTime[0]
-    for i in range(1, self.numSolutions):
-      if relativeTime[i] < winnerRelativeTime:
-        winnerIdx = i
-        winnerRelativeTime = relativeTime[i]
-    print "FastestDiagonalSolution:", winnerIdx, self.solutionNames[winnerIdx]
-    fastestGFlops = 0
-    for i in range(0, numProblemSizesFastestDiagonal):
-      gflops = diagonalData[i][1][winnerIdx]
-      if gflops > fastestGFlops:
-        fastestGFlops = gflops
-
-    rules = []
-    #                                  minGFlops      maxGFlops      oldGFlops?
-    rules.append([winnerIdx, sizeFree, fastestGFlops, fastestGFlops, -1])
-    print "Winner[%3u]: %u" % (0, winnerIdx)
-# we can't just pay attention to single winner
-# we need to compute scores for all solutions over a window
-# b/c 441115111333
-#   = 441555555333
-#
-# we can do a smoothing pass to get rid of bogus data; if a data point is more than x% slower than 4 surrounding points, than its bogus, just set it equal to average of 4 surrounding points
-#
-
-    ########################################
-    # create subsequent rules for smaller sizes
-    for diagonalDataIdx in range(1, len(diagonalData)):
-      print "DiagonalDataIdx:", diagonalDataIdx
-      # prior rule
-      priorRule = rules[len(rules)-1]
-      priorWinnerIdx = priorRule[0]
-      # candidate winner
-      candidateWinnerIdx = 0
-      candidateWinnerGFlops = diagonalData[diagonalDataIdx][1][0]
-      for j in range(1, self.numSolutions):
-        gflops = diagonalData[diagonalDataIdx][1][j]
-        if gflops > candidateWinnerGFlops:
-          candidateWinnerIdx = j
-          candidateWinnerGFlops = gflops
-      if candidateWinnerIdx == priorWinnerIdx:
-        # update prior rule to include this sizeFree
-        rules[len(rules)-1][1] = diagonalData[diagonalDataIdx][0] # size free
-        rules[len(rules)-1][2] = \
-            diagonalData[diagonalDataIdx][1][priorWinnerIdx] # perf at size
-        continue
-      else:
-        # candidate rule
-        sizeFree = diagonalData[diagonalDataIdx][0]
-        totalFlops = sizeFree*totalFlopsPerSizeFree
-        candidateGFlops = diagonalData[diagonalDataIdx][1][candidateWinnerIdx]
-        priorGFlops = diagonalData[diagonalDataIdx][1][priorWinnerIdx]
-        candidateRule = [ candidateWinnerIdx, sizeFree, candidateGFlops, \
-            candidateGFlops, -1 ]
-        # candidate and prior scores
-        candidateTimeUs = totalFlops / candidateGFlops / 1000
-        priorTimeUs = totalFlops / priorGFlops / 1000
-        candidateScore = 1*self.w2 + candidateTimeUs
-        priorScore = 0*self.w2 + priorTimeUs
-        print "DDI[%3u] Prior[%2u]: %.0fus vs Candi[%2u]: %.0fus" \
-            % (diagonalDataIdx, priorWinnerIdx, priorScore, candidateWinnerIdx, candidateScore)
-        checkMoreProblems = True
-        for newDiagonalDataIdx in range(diagonalDataIdx+1, len(diagonalData)):
-          newWinnerIdx = 0
-          newWinnerGFlops = diagonalData[newDiagonalDataIdx][1][0]
-          for j in range(1, self.numSolutions):
-            gflops = diagonalData[newDiagonalDataIdx][1][j]
-            if gflops > newWinnerGFlops:
-              newWinnerIdx = j
-              newWinnerGFlops = gflops
-          # update candidate and prior scores
-          sizeFree = diagonalData[newDiagonalDataIdx][0]
-          totalFlops = sizeFree*totalFlopsPerSizeFree
-          candidateGFlops = \
-              diagonalData[newDiagonalDataIdx][1][candidateWinnerIdx]
-          priorGFlops = diagonalData[newDiagonalDataIdx][1][priorWinnerIdx]
-          candidateTimeUs = totalFlops / candidateGFlops / 1000
-          priorTimeUs = totalFlops / priorGFlops / 1000
-          candidateScore += candidateTimeUs
-          priorScore += priorTimeUs
-          print "  NDDI[%3u] Prior[%2u]: %.0fus vs Candi[%2u]: %.0fus" \
-              % (newDiagonalDataIdx, priorWinnerIdx, priorScore, \
-              candidateWinnerIdx, candidateScore)
-          if newWinnerIdx == candidateWinnerIdx:
-            print "    newWinnerIdx == candidateWinnerIdx"
-            if candidateScore < priorScore:
-              # candidate rule accepted
-              rules.append(candidateRule)
-              print "      accepting"
-              break
-            else:
-              # candidate rule not yet accepted
-              candidateRule[1] = sizeFree
-              candidateRule[2] = candidateGFlops
-              print "      continuing"
-              continue
-          elif newWinnerIdx == priorWinnerIdx:
-            print "    newWinnerIdx == priorWinnerIdx"
-            # returned to original winner, decide now to accept/reject
-            if candidateScore < priorScore:
-              # candidate rule accepted
-              rules.append(candidateRule)
-              print "      accepting"
-              break
-            else:
-              # candidate rule rejected; update prior, continue at newSize
-              rules[len(rules)-1][1] = sizeFree
-              rules[len(rules)-1][2] = priorGFlops
-              diagonalDataIdx = newDiagonalDataIdx
-              print "      rejecting"
-              break
-          else:
-            print "    newWinnerIdx is %u" % newWinnerIdx
-            # new winner was a 3rd solution; decide now (same as above)
-            if candidateScore < priorScore:
-              # candidate rule accepted
-              rules.append(candidateRule)
-              print "      accepting"
-              break
-            else:
-              # candidate rule rejected; update prior, continue at newSize
-              rules[len(rules)-1][1] = diagonalData[newDiagonalDataIdx][0]
-              rules[len(rules)-1][2] = \
-                  diagonalData[newDiagonalDataIdx][1][priorWinnerIdx]
-              diagonalDataIdx = newDiagonalDataIdx
-              print "      rejecting"
-              break
-
-      return
-
-        # go farther forward, does candidate rule keep winning, or does priorRule keep winning?
-        # the new rule should start at a loss b/c of Weight2
-        # a few problems in the future
-            # if new rule is better, W2 gets amortized, Wt improves
-            # if new rule is worse, W2 gets amortized, Wt worsens
-        # continue to future problems until, and make final decision
-          # newRule gets better score; accept
-          # return to priorRule winner; accept/reject
-          # Yet a new winner
-            # easy: make final accept/reject including this new problem size
-            # hard: recure?
-          #
-        # is the num problems in future vary with W2,Wt?
-# Wt = 1
-# W2 = 1 means we would rather lose 1us per kernel rather than adding another split (actually they're equal)
-# so, in order for candidate to be accepted immediately, it must improve all kernels by more than 1us, or after 2 sizes, improve by 0.5us per kernel
-#
-#
-# 0 0 1 0 0
-# 0 0 1 1 0
-# 0 0 1 4 0
-# 0 0 1 4 1 0
-#
-
-      print "Winner[%3u]: %u" % (i, winnerIdx)
-
-
-    return
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    # abstract to multidimensions
-    # what is the diagonal
-    dilation = self.self.parameters["Dilation"]
-    threshold = self.self.parameters["Threshold"]
-    numProblems0 = self.numProblemSizes[self.idx0]
-
-    ############################################################################
-    # determine winner at largest size
-    solutionNumWins = [0]*self.numSolutions
-    solutionGFlops = [0]*self.numSolutions
-    for problemSizeIdx in range(max(0,numProblems0-dilation*2), numProblems0):
-      problemIndices[self.idx0] = problemSizeIdx
-      problemIndices[self.idx1] = problemSizeIdx
-      problemIdx = self.indicesToSerial(0, problemIndices)
-      winnerIdx = -1
-      winnerGFlops = -1
-      for solutionIdx in range(0, self.numSolutions):
-        solutionSerialIdx = problemIdx + solutionIdx
-        solutionTmpGFlops = self.data[solutionSerialIdx]
-        if solutionTmpGFlops > winnerGFlops:
-          winnerIdx = solutionIdx
-          winnerGFlops = solutionTmpGFlops
-        #print "updated winner: ", winnerIdx
-      #print winnerIdx
-      solutionNumWins[winnerIdx] += 1
-      if winnerGFlops > solutionGFlops[winnerIdx]:
-        solutionGFlops[winnerIdx] = winnerGFlops
-    largestWinnerIdx = -1
-    largestWinnerNumWins = -1
-    largestWinnerGFlops = -1
-    #print "FastestWins:"
-    for i in range(0, self.numSolutions):
-      #print "sol[%u] = %u wins @ %.0f GFlops" \
-      #    % (i, solutionNumWins[i], solutionGFlops[i])
-      if solutionNumWins[i] > largestWinnerNumWins:
-        largestWinnerIdx = i
-        largestWinnerNumWins = solutionNumWins[i]
-        largestWinnerGFlops = solutionGFlops[i]
-    #print "Winner at Largest Problem: S[%u] @ %.0f GFlops with %u/%u wins" \
-    #    % (largestWinnerIdx, largestWinnerGFlops, largestWinnerNumWins, \
-    #    dilation*2)
-    problemIndices[self.idx0] = numProblems0-1
-    problemIndices[self.idx1] = numProblems0-1
-    largestWinnerAtLargestProblemIdx = self.indicesToSerial(largestWinnerIdx, \
-        problemIndices)
-    largestWinnerGFlopsAtLargestSize = \
-        self.data[largestWinnerAtLargestProblemIdx]
-
-    ############################################################################
-    # Diagonal Rule
-    # solutionIdx, minSizeThresholdIdx, gflops at minSize, maxGFlops, oldGFlops
-    numRules = 1
-    diagonalRules = [ [largestWinnerIdx, deepcopy(problemIndices), \
-        largestWinnerGFlopsAtLargestSize, largestWinnerGFlops, -1] ]
-
-    ############################################################################
-    # For largest to smallest, determine fastest solution
-    for problemSizeIdx in range(numProblems0-2, -1, -1):
-      problemIndices[self.idx0] = problemSizeIdx
-      problemIndices[self.idx1] = problemSizeIdx
-      problemIdx = self.indicesToSerial(0, problemIndices)
-
-      # current rule winner performance at this problemSizeIdx
-      ruleWinnerIdx = diagonalRules[-1][0]
-      ruleWinnerGFlopsForSize = self.data[problemIdx + ruleWinnerIdx]
-
-      #determine fastest at this problemSizeIdx
-      (winnerForSizeIdx, winnerForSizeGFlops) = \
-          self.getWinnerForProblem( problemIndices )
-
-      # ruleWinner also wins at this problem size (at least by threshold)
-      if winnerForSizeIdx == ruleWinnerIdx \
-          or ruleWinnerGFlopsForSize > (1-threshold)*winnerForSizeGFlops:
-        # just update rule
-        diagonalRules[numRules-1][1] = deepcopy(problemIndices)
-        diagonalRules[numRules-1][2] = ruleWinnerGFlopsForSize
-        diagonalRules[numRules-1][3] = max(diagonalRules[numRules-1][3], \
-            ruleWinnerGFlopsForSize)
-
-      # we have a new candidate winner
-      # only keep it if don't revert back to ruleWinner over next Dilation
-      else:
-
-        # check if we don't revert back to ruleWinner over next Dilation probs
-        revert = False
-        endDilationIdx = max(-1, problemSizeIdx-dilation)
-        for dilationSizeIdx in range(problemSizeIdx-1, \
-            endDilationIdx, -1):
-          problemIndices[self.idx0] = dilationSizeIdx
-          problemIndices[self.idx1] = dilationSizeIdx
-          dilationIdx = self.indicesToSerial(0, problemIndices)
-          ruleWinnerGFlopsForDilation = self.data[dilationIdx \
-              + ruleWinnerIdx]
-          #determine fastest at this problemSizeIdx
-          (winnerForDilationIdx, winnerForDilationGFlops) = \
-              self.getWinnerForProblem(problemIndices)
-
-          # ruleWinner also wins at dilation size (at least by threshold)
-          if winnerForDilationIdx == ruleWinnerIdx \
-              or ruleWinnerGFlopsForDilation \
-              > (1-threshold)*winnerForSizeGFlops:
-            # yes, within Dilation, we've returned to same winner
-            revert = True
-            # so update rule for this size
-            diagonalRules[numRules-1][1] = deepcopy(problemIndices)
-            diagonalRules[numRules-1][2] = winnerForDilationGFlops
-            diagonalRules[numRules-1][3] = max(diagonalRules[numRules-1][3], \
-                winnerForSizeGFlops)
-            # resume outer loop after dilation
-            problemSizeIdx = dilationSizeIdx
-            break
-          else:
-            # different winner at this dilation size
-            # don't need to do anything
-            pass
-
-        # if we never revert to rule during dilation, create new rule
-        if not revert:
-          # solutionIdx, minSizeThresholdIdx, gflops at minSize, maxGFlops, old
-          newRule = [ winnerForSizeIdx, deepcopy(problemIndices), \
-              winnerForSizeGFlops, winnerForSizeGFlops, ruleWinnerGFlopsForSize]
-          diagonalRules.append(newRule)
-          numRules += 1
-          #print "Added new rule: %s" % newRule
-
-    return diagonalRules
-    #end diagonal rules
-
-
-  ##############################################################################
-  # Skinny Solutions
-  ##############################################################################
-  def getSkinnySolutions(self, diagonalRules, problemIndices, \
-      idxLarge, idxSmall):
-    idx0 = self.idx0
-    idx1 = self.idx1
-    #idxU = self.idxU
-    #dilation = self.self.parameters["Dilation"]
-    threshold = self.self.parameters["Threshold"]
-
-    skinnyRules = []
-
-    # for each size threshold along diagonal
-    for diagonalRuleIdx in range(0, len(diagonalRules)):
-      diagonalRule = diagonalRules[diagonalRuleIdx]
-      diagonalRuleWinnerIdx = diagonalRule[0]
-      diagonalRuleThresholdProblem = diagonalRule[1]
-      #diagonalRuleGFlops = diagonalRule[2] # perf at threshold
-      thresholdSizeFree = self.getSizeFree(diagonalRuleThresholdProblem)
-      print2("ThresholdSizeFree[%u][%u]: %u" \
-          % (diagonalRuleThresholdProblem[idx0], \
-          diagonalRuleThresholdProblem[idx1], \
-          thresholdSizeFree))
-
-      # check skinny d0<<d1 (large d0, small d1)
-      skinnyProblemIndices = deepcopy(problemIndices)
-      for sizeIdxSmall in range( diagonalRuleThresholdProblem[idxSmall]-1, -1, -1):
-        skinnyProblemIndices[idxSmall] = sizeIdxSmall
-        for sizeIdxLarge in range( diagonalRuleThresholdProblem[idxLarge], \
-            self.numProblemSizes[idxLarge]):
-          skinnyProblemIndices[idxLarge] = sizeIdxLarge
-
-
-          skinnySizeFree = self.getSizeFree(skinnyProblemIndices)
-          if skinnySizeFree > thresholdSizeFree:
-            #print "SkinnySizeFree[%u][%u]: %u" % (sizeIdxSmall, sizeIdxLarge, \
-            #  skinnySizeFree)
-
-            # rule winner's performance at this skinnyness
-            skinnyProblemIdx = self.indicesToSerial(0, skinnyProblemIndices)
-            diagonalWinnerGFlopsForSkinny = self.data[skinnyProblemIdx \
-                + diagonalRuleWinnerIdx]
-
-            # which solution wins here?
-            (winnerIdx, winnerGFlops) = \
-                self.getWinnerForProblem(skinnyProblemIndices)
-            #print winnerIdx, winnerGFlops
-            if winnerIdx == diagonalRuleWinnerIdx \
-                or diagonalWinnerGFlopsForSkinny > (1-threshold)*winnerGFlops:
-              # diagonal rule also wins here
-              print2("if dS <%5u and dL >%5u diagnl S[%2u] %5.0f == S[%2u] %5.0f GFlops" \
-                  % (self.problemIndexToSize[idxSmall][sizeIdxSmall], \
-                  self.problemIndexToSize[idxLarge][sizeIdxLarge], \
-                  winnerIdx, winnerGFlops, diagonalRuleWinnerIdx, \
-                  diagonalWinnerGFlopsForSkinny ))
-              pass
-            else:
-              # we're so skinny that diagonal rule no longer applies
-              print2("if dS <%5u and dL >%5u skinny S[%2u] %5.0f >> S[%2u] %5.0f GFlops" \
-                  % (self.problemIndexToSize[idxSmall][sizeIdxSmall], \
-                  self.problemIndexToSize[idxLarge][sizeIdxLarge], \
-                  winnerIdx, winnerGFlops, diagonalRuleWinnerIdx, \
-                  diagonalWinnerGFlopsForSkinny ))
-              skinnyRule = [deepcopy(skinnyProblemIndices), winnerIdx, \
-                  winnerGFlops]
-              skinnyRules.append(skinnyRule)
-              # TODO need to use dilate parameter to make sure we've switched
-              # TODO data along this size may not agree with
-              #   data along different sizes (but perhaps it should
-              # TODO need extra loop here, to iterate idxSmall to
-              # smaller sizes to see if the solution changes further
-
-            # does the diagonalRuleWinner also win here?
-            break # only check the problem size closest to ruleSize
-
-    return skinnyRules
-    # end skinny solutions
-
-
-  ##############################################################################
-  # Determine Logic Along U
-  ##############################################################################
-  def determineLogicAlongU(self):
-    globalRange = []
-    for i in range(0, self.numIndices):
-      globalRange.append( [0, self.numProblemSizes[i]] )
-
-
-
-
-    self.print2D([0, 0])
-
-    ############################################################################
-    # Determine Solutions Along Diagonal
-    # roughly same splitting regardless of sizeU
-    problemIndices = []
-    for numProblemsForIndex in self.numProblemSizes:
-      problemIndices.append(numProblemsForIndex-1)
-    print problemIndices
-    self.diagonalRules = self.getFastestSolutionsAlongDiagonal(problemIndices)
-    if True:
-      print2("Diagonal Rules:")
-      for rule in self.diagonalRules:
-        string = "  if freeSize >=%4u" % self.problemIndexToSize[0][rule[1][0]]
-        for i in range(1, self.numIndices):
-          string += "x%4u" % self.problemIndexToSize[i][rule[1][i]]
-        string += " return S[%u] @ %5.0f-%5.0f>%5.0f GFlops is %s" \
-            % (rule[0], rule[2], rule[3], rule[4], \
-            self.solutionNames[rule[0]])
-        print2(string)
-
-    ############################################################################
-    # Determine Skinny0 Solutions
-    skinnyRules01 = self.getSkinnySolutions(self.diagonalRules, problemIndices, \
-        self.idx0, self.idx1)
-    #print "Skinny Rules:"
-    #for rule in skinnyRules01:
-    #  string = "  if freeSize >=%4u" % data.problemIndexToSize[0][rule[1][0]]
-    #  for i in range(1, data.numIndices):
-    #    string += "x%4u" % data.problemIndexToSize[i][rule[1][i]]
-    #  string += " return S[%u] @ %5.0f-%5.0f>%5.0f GFlops is %s" \
-    #      % (rule[0], rule[2], rule[3], rule[4], \
-    #      data.solutionNames[rule[0]])
-
-    ############################################################################
-    # Determine Skinny1 Solutions
-    skinnyRules10 = self.getSkinnySolutions(self.diagonalRules, problemIndices, \
-        self.idx1, self.idx0)
-
-    # list solutions that actually get used
-    solutionIndicesUsed = []
-    for rule in skinnyRules01:
-      pass
-    for rule in skinnyRules10:
-      pass
-    for rule in self.diagonalRules:
-      solutionIdx = rule[0]
-      solution = self.solutions[solutionIdx]
-      MT0 = solution["MacroTile0"]
-      MT1 = solution["MacroTile1"]
-      DU = solution["DepthU"]
-      #print "Rule Tile S[%u]: %ux%ux%u" % (solutionIdx, MT0, MT1, DU)
-      # is this solution in the list
-      inList = False
-      for solutionUsed in solutionIndicesUsed:
-        if solutionUsed[0] == solutionIdx:
-          inList = True
-          break
-      if not inList:
-        insertIdx = len(solutionIndicesUsed)
-        for i in range(0, len(solutionIndicesUsed)):
-          iMT0 = solutionIndicesUsed[i][1]
-          iMT1 = solutionIndicesUsed[i][2]
-          iDU  = solutionIndicesUsed[i][3]
-          #print "  compare S[%u]: %ux%ux%u" % (solutionIndicesUsed[i][0], \
-          #    iMT0, iMT1, iDU)
-          if MT0*MT1 < iMT0*iMT1:
-            insertIdx = i
-            break
-          elif MT0*MT1 > iMT0*iMT1:
-            continue
-          else: # MT == MT
-            if DU < iDU:
-              insertIdx = i
-              break
-            else:
-              continue
-
-          # if i'm smaller than i, insert me before i
-        #print "insert: %u" % insertIdx
-        solutionIndicesUsed.insert(insertIdx, [solutionIdx, MT0, MT1, DU])
-    #print solutionIndicesUsed
-
-    # list of solutions used
-    self.solutionsUsed = []
-    for solutionIndexUsed in solutionIndicesUsed:
-      self.solutionsUsed.append(self.solutions[solutionIndexUsed[0]])
-
-    # translate rules to new solution indices
-    for rule in skinnyRules01:
-      pass
-    for rule in skinnyRules10:
-      pass
-    for ruleIdx in range(0, len(self.diagonalRules)):
-      solutionIdx = self.diagonalRules[ruleIdx][0]
-      for i in range(0, len(solutionIndicesUsed)):
-        solutionIndexUsed = solutionIndicesUsed[i]
-        if solutionIdx == solutionIndexUsed[0]:
-          self.diagonalRules[ruleIdx][0] = i
-          break
-      # change problemSizeIndices to sizes
-      for i in range(0, 3):
-        self.diagonalRules[ruleIdx][1][i] = \
-            self.problemIndexToSize[i][ self.diagonalRules[ruleIdx][1][i] ]
-
-    print2("# New Rules: %s" % self.diagonalRules)
-
-
 
   ##############################################################################
   ##############################################################################
@@ -1078,22 +474,22 @@ def print2D(self, indices ):
         sss[sIdx] += "%4u" % self.problemIndexToSize[0][i]
       for j in range(0, self.numProblemSizes[1]):
         problemIndices[self.idx1] = j
-        problemIdx = self.indicesToSerial(0, problemIndices)
+        problemSerial = self.indicesToSerial(0, problemIndices)
         for sIdx in range(0, self.numSolutions):
-          sss[sIdx] += ",%f" % self.data[problemIdx+sIdx]
+          sss[sIdx] += ",%f" % self.data[problemSerial+sIdx]
 
-        if self.data[problemIdx+0] > self.data[problemIdx+1]:
+        if self.data[problemSerial+0] > self.data[problemSerial+1]:
           winnerIdx = 0
-          winnerGFlops = self.data[problemIdx+0]
+          winnerGFlops = self.data[problemSerial+0]
           secondIdx = 1
-          secondGFlops = self.data[problemIdx+1]
+          secondGFlops = self.data[problemSerial+1]
         else:
           winnerIdx = 1
-          winnerGFlops = self.data[problemIdx+1]
+          winnerGFlops = self.data[problemSerial+1]
           secondIdx = 0
-          secondGFlops = self.data[problemIdx+0]
+          secondGFlops = self.data[problemSerial+0]
         for solutionIdx in range(2, self.numSolutions):
-          solutionSerialIdx = problemIdx + solutionIdx
+          solutionSerialIdx = problemSerial + solutionIdx
           solutionGFlops = self.data[solutionSerialIdx]
           if solutionGFlops > winnerGFlops:
             #print "%f > %f" % (solutionGFlops, winnerGFlops)
@@ -1157,20 +553,20 @@ def leastImportantSolution(self):
       for size in problemSizes:
         totalFlops *= size
 
-      problemIdx = self.indicesToSerial(0, problemIndices)
-      if self.data[problemIdx+0] > self.data[problemIdx+1]:
+      problemSerial = self.indicesToSerial(0, problemIndices)
+      if self.data[problemSerial+0] > self.data[problemSerial+1]:
         winnerIdx = 0
-        winnerGFlops = self.data[problemIdx+0]
+        winnerGFlops = self.data[problemSerial+0]
         secondIdx = 1
-        secondGFlops = self.data[problemIdx+1]
+        secondGFlops = self.data[problemSerial+1]
       else:
         winnerIdx = 1
-        winnerGFlops = self.data[problemIdx+1]
+        winnerGFlops = self.data[problemSerial+1]
         secondIdx = 0
-        secondGFlops = self.data[problemIdx+0]
+        secondGFlops = self.data[problemSerial+0]
 
       for solutionIdx in range(2, self.numSolutions):
-        solutionSerialIdx = problemIdx + solutionIdx
+        solutionSerialIdx = problemSerial + solutionIdx
         solutionGFlops = self.data[solutionSerialIdx]
         if solutionGFlops > winnerGFlops:
           secondIdx = winnerIdx
@@ -1194,9 +590,29 @@ def leastImportantSolution(self):
 
 
   ##############################################################################
-  # Score Range For Logic
-  def scoreRangeForLogic(self, indexRange, logic):
-    pass
+  # Get Winner For Problem
+  def getWinnerForProblem(self, problemIndices):
+    problemSerial = self.indicesToSerial(0, problemIndices)
+    winnerIdx = -1
+    winnerGFlops = -1
+    for solutionIdx in range(0, self.numSolutions):
+      solutionSerialIdx = problemSerial + solutionIdx
+      solutionGFlops = self.data[solutionSerialIdx]
+      if solutionGFlops > winnerGFlops:
+        #print "%f > %f" % (solutionGFlops, winnerGFlops)
+        winnerIdx = solutionIdx
+        winnerGFlops = solutionGFlops
+    return (winnerIdx, winnerGFlops)
+
+  ##############################################################################
+  # Winner For Range
+  def winnerForRange(self, indexRange):
+    scores = self.scoreRangeForSolutions(indexRange)
+    winnerIdx = 0
+    for solutionIdx in range(1, self.numSolutions):
+      if scores[solutionIdx] < scores[winnerIdx]:
+        winnerIdx = solutionIdx
+    return winnerIdx
 
   ##############################################################################
   # Score (microseconds) Range For Solutions
@@ -1211,6 +627,89 @@ def scoreRangeForSolutions(self, indexRange):
         scores[solutionIdx] += timeUs
     return scores
 
+  ##############################################################################
+  # Score Range For Logic
+  def scoreRangeForLogic(self, indexRange, logic):
+    print "ScoreRangeForLogic", indexRange, logic
+    depth = self.getLogicDepth([logic])
+    depth = self.numIndices - depth
+    #obj = logic
+    #while isinstance(obj[0], list):
+    #  obj = obj[0][1]
+    #  depth -= 1
+    print "Depth:", depth
+    fullLogic = deepcopy(logic)
+    for i in range(0, depth):
+      #print "Logic:", fullLogic
+      fullLogic = [-1, [fullLogic]]
+    fullLogic = [fullLogic]
+    #print "FullLogic:", fullLogic
+    return self.scoreRangeForFullLogic(indexRange, fullLogic)
+
+  ##############################################################################
+  # Score Range For Full Logic
+  def scoreRangeForFullLogic(self, indexRange, logic):
+    print "ScoreRangeForFullLogic", indexRange, logic
+    score = 0
+    for problemIndices in self.problemIndicesForRange(indexRange):
+      problemSerial = self.indicesToSerial(0, problemIndices)
+      totalFlops = self.totalFlopsForProblemIndices(problemIndices)
+      solutionIdx = self.getSolutionForProblemIndicesUsingLogic( \
+          problemIndices, logic)
+      gflops = self.data[problemSerial + solutionIdx]
+      timeUs = totalFlops / gflops / 1000
+      score += timeUs
+    logicComplexity = [0]*self.numIndices
+    self.scoreLogicComplexity(logic, logicComplexity)
+    score += self.parameters["BranchWeight"] * sum(logicComplexity)
+    print "LogicComplexity:", logicComplexity
+    return score
+
+  ##############################################################################
+  # Get Solution For Problem Indices Using Logic
+  def getSolutionForProblemIndicesUsingLogic(self, problemIndices, logic):
+    currentProblemIndices = problemIndices
+    currentLogic = logic
+    for i in range(0, self.numIndices):
+      #print "CurrentLogic[%u]: %s" % (i, currentLogic)
+      currentSizeIndex = currentProblemIndices[0]
+      for j in range(0, len(currentLogic)):
+        if currentLogic[j][0] < 0:
+          currentProblemIndices = currentProblemIndices[1:]
+          currentLogic = currentLogic[j][1]
+          break
+        if currentLogic[j][0] >= 0:
+          if currentSizeIndex <= currentLogic[j][0]:
+            currentProblemIndices = currentProblemIndices[1:]
+            currentLogic = currentLogic[j][1]
+            break
+    #print "CurrentLogic[%u]: %s" % (i, currentLogic)
+    return currentLogic
+
+  ##############################################################################
+  # Score Logic Complexity
+  def scoreLogicComplexity(self, logic, logicComplexity):
+    print "ScoreLogicComplexity: %s" % (logic)
+    depth = self.getLogicDepth(logic)
+    depth = self.numIndices - depth
+    if depth == 0: return
+    #print "[%u]ScoreLogicComplexity: %s" % (depth, logic)
+    currentLogic = logic
+    for i in range(0, len(logic)):
+      logicComplexity[depth] += 1
+      self.scoreLogicComplexity(logic[i][1], logicComplexity)
+
+
+  ##############################################################################
+  # Get Logic Depth
+  def getLogicDepth(self, logic):
+    obj = logic
+    depth = 0
+    while isinstance(obj, list):
+      obj = obj[0][1]
+      depth += 1
+    return depth
+
   ##############################################################################
   # Total Flops For Problem Indices
   def totalFlopsForProblemIndices(self, problemIndices):
@@ -1247,12 +746,12 @@ def removeSolution(self, removeSolutionIdx):
     # update data
     self.totalSize = self.totalProblems * self.numSolutions
     self.data = array.array('f', [0]*self.totalSize)
-    for problemIdx in range(0, self.totalProblems):
+    for problemIndex in range(0, self.totalProblems):
       newSolutionIdx = 0
       for oldSolutionIdx in range(0, oldNumSolutions):
         if oldSolutionIdx != removeSolutionIdx:
-          self.data[problemIdx*self.numSolutions+newSolutionIdx] \
-              = oldData[problemIdx*oldNumSolutions+oldSolutionIdx]
+          self.data[problemIndex*self.numSolutions+newSolutionIdx] \
+              = oldData[problemIndex*oldNumSolutions+oldSolutionIdx]
           newSolutionIdx += 1
 
   ##############################################################################
@@ -1314,20 +813,6 @@ def problemIndicesForRange(self, indexRange):
           break
     return problemIndexList
 
-  ##############################################################################
-  # Get Winner For Problem
-  def getWinnerForProblem(self, problemIndices):
-    problemIdx = self.indicesToSerial(0, problemIndices)
-    winnerIdx = -1
-    winnerGFlops = -1
-    for solutionIdx in range(0, self.numSolutions):
-      solutionSerialIdx = problemIdx + solutionIdx
-      solutionGFlops = self.data[solutionSerialIdx]
-      if solutionGFlops > winnerGFlops:
-        #print "%f > %f" % (solutionGFlops, winnerGFlops)
-        winnerIdx = solutionIdx
-        winnerGFlops = solutionGFlops
-    return (winnerIdx, winnerGFlops)
 
 
   ##############################################################################

From dab3b3c542cbaef459c96effa442a81fd5c7e79b Mon Sep 17 00:00:00 2001
From: David Tanner <guacamoleo@gmail.com>
Date: Thu, 2 Mar 2017 11:41:08 -0600
Subject: [PATCH 14/21] logic complexity calculation working

---
 Tensile/LibraryLogic.py    | 424 ++++++++++++++++++++++---------------
 Tensile/SolutionStructs.py |   2 +-
 2 files changed, 253 insertions(+), 173 deletions(-)

diff --git a/Tensile/LibraryLogic.py b/Tensile/LibraryLogic.py
index 9dfeba30b..ae5b810e9 100644
--- a/Tensile/LibraryLogic.py
+++ b/Tensile/LibraryLogic.py
@@ -38,30 +38,39 @@ def analyzeProblemType( problemTypeTuple, inputParameters ):
 
   ######################################
   # Read Data From CSV
-  logic = LogicAnalyzer(problemType, problemSizes, solutions, inputParameters)
-  logic.populateFromCSV(dataFileName)
+  logicAnalyzer = LogicAnalyzer( \
+      problemType, problemSizes, solutions, inputParameters)
+  logicAnalyzer.populateFromCSV(dataFileName)
 
   ######################################
   # Remove invalid solutions
-  logic.removeInvalidSolutions()
+  logicAnalyzer.removeInvalidSolutions()
 
   ######################################
   # Remove least important solutions
-  logic.removeLeastImportantSolutions()
+  logicAnalyzer.removeLeastImportantSolutions()
 
   ######################################
   # Correct outliers
-  # logic.smooth()
-  logic.print2D([0, 0])
+  # logicAnalyzer.smooth()
+  logicAnalyzer.print2D([0, 0])
 
   ######################################
   # Create Rules
-  logic.enRule(0, logic.globalIndexRange)
+  logic = logicAnalyzer.enRule(0, logicAnalyzer.globalIndexRange)
+  print "Final Logic:"
+  print logic
+  logicComplexity = [0]*logicAnalyzer.numIndices
+  logicAnalyzer.scoreLogicComplexity(logic, logicComplexity)
+  print "Logic Complexity:", logicComplexity
+  score = logicAnalyzer.scoreRangeForLogic( \
+      logicAnalyzer.globalIndexRange, logic)
+  print "Global Score:", score
 
 
 
   #return (skinnyRules01, skinnyRules10, diagonalRules)
-  #return (problemType, logic.solutionsUsed, [], [], logic.diagonalRules )
+  #return (problemType, logicAnalyzer.solutionsUsed, [], [], logicAnalyzer.diagonalRules )
   return (problemType, [], [], [], [] )
 
 
@@ -173,6 +182,7 @@ def __init__(self, problemType, problemSizes, solutions, inputParameters):
       self.globalIndexRange.append([0, self.numProblemSizes[i]])
     self.problemIndicesForGlobalRange \
         = self.problemIndicesForRange(self.globalIndexRange)
+    self.tab = [""]*self.numIndices
 
 
 
@@ -334,76 +344,135 @@ def smooth(self):
   #
   ##############################################################################
   def enRule(self, currentIndexIndex, currentIndexRange):
-    tab = ""
-    for i in range(0, currentIndexIndex):
-      tab += "  "
-    print "%senRule(%u, %s)" % (tab, currentIndexIndex, currentIndexRange)
+    cii = currentIndexIndex
+    if currentIndexIndex == 0:
+      self.tab[cii] = "| "
+    elif currentIndexIndex == 1:
+      self.tab[cii] = "[%2u]-| " % ( \
+          currentIndexRange[self.indexOrder[0]][0])
+    elif currentIndexIndex == 2:
+      self.tab[cii] = "[%2u,%2u]--| " % ( \
+          currentIndexRange[self.indexOrder[0]][0], \
+          currentIndexRange[self.indexOrder[1]][0])
+    elif currentIndexIndex == 3:
+      self.tab[cii] = "[%2u,%2u,%2u]---| " % ( \
+          currentIndexRange[self.indexOrder[0]][0], \
+          currentIndexRange[self.indexOrder[1]][0], \
+          currentIndexRange[self.indexOrder[2]][0])
+    elif currentIndexIndex == 4:
+      self.tab[cii] = "[%2u,%2u,%2u,%2u]---| " % ( \
+          currentIndexRange[self.indexOrder[0]][0], \
+          currentIndexRange[self.indexOrder[1]][0], \
+          currentIndexRange[self.indexOrder[2]][0], \
+          currentIndexRange[self.indexOrder[3]][0])
+    tab = self.tab[cii]
     currentIndex = self.indexOrder[currentIndexIndex]
+    print "%senRule(%s)" % (tab, currentIndexRange)
     nextIndexIndex = currentIndexIndex+1
     nextIndexRange = deepcopy(currentIndexRange)
     isLastIndex = currentIndexIndex == self.numIndices-1
+    ruleList = []
 
+    ########################################
     # if there's only 1 problem size here
+    ########################################
     if currentIndexRange[currentIndex][1] \
         - currentIndexRange[currentIndex][0] == 1:
 
+      ########################################
       # this is last index, so just return fastest solution
       if isLastIndex:
-        # optimize b/c this should be only single problem
+        # TODO optimize b/c this should be only single problem
         #scores = self.scoreRangeForSolutions(currentIndexRange)
         #winnerIdx = 0
         #for solutionIdx in range(1, self.numSolution):
         #  if scores[solutionIdx] < scores[winnerIdx]:
         #    winnerIdx = solutionIdx
         winnerIdx = self.winnerForRange(currentIndexRange)
-        print "%s  returning early winner=%u" % (tab, winnerIdx)
-        return [ -1, winnerIdx ]
+        print "%sreturning early winner=%u" % (tab, winnerIdx)
+        ruleList.append(-1)
+        ruleList.append(winnerIdx)
 
-      # this isn't last index, so just return next index
+      ########################################
+      # this isn't last index, so just recursively return next index
       else:
-        print "%s  returning early enRule(%u,%s)" \
-            % (tab, nextIndexIndex, nextIndexRange)
-        return [ -1, self.enRule(nextIndexIndex, nextIndexRange) ]
-
-    # ruleList
-    ruleList = []
-
-    # create rule for smallest size
-    initialSize = min(currentIndexRange[currentIndex][0] \
-        + self.parameters["InitialSolutionWindow"], \
-        self.numProblemSizes[currentIndex])
-    nextIndexRange[currentIndex][1] = initialSize
-    if isLastIndex:
-      winnerIdx = self.winnerForRange(nextIndexRange)
-      initialRule = [ currentIndexRange[currentIndex][0], winnerIdx]
-
+        print "%sreturning early enRule(%s)" \
+            % (tab, nextIndexRange)
+        rule = [ -1, self.enRule(nextIndexIndex, nextIndexRange) ]
+        ruleList.append(rule)
+
+    ########################################
+    # full iterative rule list
+    ########################################
     else:
-      initialRule = [ currentIndexRange[currentIndex][0], \
-          self.enRule(nextIndexIndex, nextIndexRange) ]
-    ruleList.append(initialRule)
-
-    # for all problem indices in this index
-
-    for problemIndex in range(currentIndexRange[currentIndex][0], \
-        currentIndexRange[currentIndex][1]):
-      print "%s  pIdx: %u" % (tab, problemIndex)
-      nextIndexRange[currentIndex][0] = problemIndex
-      nextIndexRange[currentIndex][1] = problemIndex+1
 
+      ########################################
+      # create initial rule
+      initialSize = min(currentIndexRange[currentIndex][0] \
+          + self.parameters["InitialSolutionWindow"], \
+          self.numProblemSizes[currentIndex])
+      nextIndexRange[currentIndex][1] = initialSize
       if isLastIndex:
-        winnerIdx = self.winnerForRange(currentIndexRange)
-        candidateRule = [ currentIndexRange[currentIndex][0], winnerIdx]
+        winnerIdx = self.winnerForRange(nextIndexRange)
+        initialRule = [ currentIndexRange[currentIndex][0], winnerIdx]
       else:
-        candidateRule = [ problemIndex, self.enRule(nextIndexIndex, \
-            nextIndexRange) ]
-      priorRule = ruleList[len(ruleList)-1]
-      priorRuleScore = self.scoreRangeForLogic(nextIndexRange, priorRule)
-      candidateRuleScore = self.scoreRangeForLogic(nextIndexRange, \
-          candidateRule)
-      candidateRuleScore += self.parameters["BranchWeight"] # penalize
-      if candidateRuleScore < priorRuleScore:
-        ruleList.append(candidateRule)
+        print "%sinitialRule(%s)" % (tab, nextIndexRange)
+        initialRule = [ currentIndexRange[currentIndex][0], \
+            self.enRule(nextIndexIndex, nextIndexRange) ]
+        print "%sinitialRule(%s) DONE" % (tab, nextIndexRange)
+      ruleList.append(initialRule)
+
+      ########################################
+      # for all problem indices in this index
+      for problemIndex in range(currentIndexRange[currentIndex][0]+1, \
+          currentIndexRange[currentIndex][1]):
+        nextIndexRange[currentIndex][0] = problemIndex
+        nextIndexRange[currentIndex][1] = problemIndex+1
+        priorRule = ruleList[len(ruleList)-1]
+        priorRuleForSize = deepcopy(priorRule)
+        priorRuleForSize[0] = problemIndex
+
+        if isLastIndex:
+          winnerIdx = self.winnerForRange(nextIndexRange)
+          candidateRule = [ problemIndex, winnerIdx]
+        else:
+          candidateRule = [ problemIndex, self.enRule(nextIndexIndex, \
+              nextIndexRange) ]
+
+        ########################################
+        # candidate same as prior
+        if candidateRule[1] == priorRule[1]:
+          print "%sP[%2u]: same" % (tab, problemIndex)
+          ruleList[len(ruleList)-1][0] = problemIndex
+          continue
+
+        ########################################
+        # compare candidate vs prior
+        else:
+          print "%sScoring P:%s for Prior=%s, Cand=%s" \
+              % ( tab, nextIndexRange, priorRuleForSize, candidateRule)
+          priorRuleScore = self.scoreRangeForLogic(nextIndexRange, \
+              [priorRuleForSize])
+          candidateRuleScore = self.scoreRangeForLogic(nextIndexRange, \
+              [candidateRule])
+          candidateRuleScore += self.parameters["BranchWeight"] # penalize
+          candidateFaster = candidateRuleScore < priorRuleScore
+          print "%sP[%2u]: %s %s~%.0fus < %s~%.0fus" % (tab, problemIndex, \
+              "wins" if candidateFaster else "same", \
+              candidateRule, candidateRuleScore, priorRuleForSize, \
+              priorRuleScore )
+
+          ########################################
+          # candidate wins
+          if candidateRuleScore < priorRuleScore:
+            ruleList.append(candidateRule)
+
+          ########################################
+          # prior wins
+          else:
+            ruleList[len(ruleList)-1][0] = problemIndex
 
+    print "%sReturning RuleList: %s" % (tab, ruleList)
     return ruleList
 
 
@@ -417,20 +486,9 @@ def enRule(self, currentIndexIndex, currentIndexRange):
   ##############################################################################
 
 
-
-
-  ##############################################################################
-  ##############################################################################
-  ###
-  ###  Helper / Low-Level Functions
-  ###
-  ##############################################################################
-  ##############################################################################
-
-
-
   ##############################################################################
   # Print2D
+  ##############################################################################
   def print2D(self, indices ):
     indicesIdx = 0
     problemIndices = []
@@ -538,6 +596,7 @@ def print2D(self, indices ):
 
   ##############################################################################
   # Least Important Solution
+  ##############################################################################
   def leastImportantSolution(self):
     solutionImportance = []
     for i in range(0, self.numSolutions):
@@ -590,66 +649,70 @@ def leastImportantSolution(self):
 
 
   ##############################################################################
-  # Get Winner For Problem
-  def getWinnerForProblem(self, problemIndices):
-    problemSerial = self.indicesToSerial(0, problemIndices)
-    winnerIdx = -1
-    winnerGFlops = -1
-    for solutionIdx in range(0, self.numSolutions):
-      solutionSerialIdx = problemSerial + solutionIdx
-      solutionGFlops = self.data[solutionSerialIdx]
-      if solutionGFlops > winnerGFlops:
-        #print "%f > %f" % (solutionGFlops, winnerGFlops)
-        winnerIdx = solutionIdx
-        winnerGFlops = solutionGFlops
-    return (winnerIdx, winnerGFlops)
-
+  # Remove Solution
   ##############################################################################
-  # Winner For Range
-  def winnerForRange(self, indexRange):
-    scores = self.scoreRangeForSolutions(indexRange)
-    winnerIdx = 0
-    for solutionIdx in range(1, self.numSolutions):
-      if scores[solutionIdx] < scores[winnerIdx]:
-        winnerIdx = solutionIdx
-    return winnerIdx
+  def removeSolution(self, removeSolutionIdx):
+
+    # temporarily move current to old
+    oldSolutions = self.solutions
+    oldNumSolutions = self.numSolutions
+    oldData = self.data
+    oldTotalSize = self.totalSize
+
+    # update solutions
+    self.solutions = []
+    for i in range(0, oldNumSolutions):
+      if i != removeSolutionIdx:
+        self.solutions.append(oldSolutions[i])
+    self.solutionMinNaming = Solution.getMinNaming(self.solutions)
+    self.solutionNames = []
+    self.solutionTiles = []
+    for solution in self.solutions:
+      self.solutionNames.append(Solution.getNameMin(solution, \
+          self.solutionMinNaming))
+      self.solutionTiles.append("%ux%u"%(solution["MacroTile0"], \
+          solution["MacroTile1"]))
+    self.numSolutions = len(self.solutions)
+
+    # update data
+    self.totalSize = self.totalProblems * self.numSolutions
+    self.data = array.array('f', [0]*self.totalSize)
+    for problemIndex in range(0, self.totalProblems):
+      newSolutionIdx = 0
+      for oldSolutionIdx in range(0, oldNumSolutions):
+        if oldSolutionIdx != removeSolutionIdx:
+          self.data[problemIndex*self.numSolutions+newSolutionIdx] \
+              = oldData[problemIndex*oldNumSolutions+oldSolutionIdx]
+          newSolutionIdx += 1
 
-  ##############################################################################
-  # Score (microseconds) Range For Solutions
-  def scoreRangeForSolutions(self, indexRange):
-    scores = [0]*self.numSolutions
-    for problemIndices in self.problemIndicesForRange(indexRange):
-      problemSerial = self.indicesToSerial(0, problemIndices)
-      totalFlops = self.totalFlopsForProblemIndices(problemIndices)
-      for solutionIdx in range(0, self.numSolutions):
-        gflops = self.data[problemSerial+solutionIdx]
-        timeUs = totalFlops / gflops / 1000
-        scores[solutionIdx] += timeUs
-    return scores
 
   ##############################################################################
   # Score Range For Logic
+  ##############################################################################
   def scoreRangeForLogic(self, indexRange, logic):
-    print "ScoreRangeForLogic", indexRange, logic
-    depth = self.getLogicDepth([logic])
+    #print "ScoreRangeForLogic", indexRange, logic
+    depth = self.getLogicDepth(logic)
     depth = self.numIndices - depth
+    #print "%sSRFL R=%s L=%s" % (self.tab[depth], indexRange, logic)
     #obj = logic
     #while isinstance(obj[0], list):
     #  obj = obj[0][1]
     #  depth -= 1
-    print "Depth:", depth
+    #print "Depth:", depth
     fullLogic = deepcopy(logic)
     for i in range(0, depth):
       #print "Logic:", fullLogic
-      fullLogic = [-1, [fullLogic]]
-    fullLogic = [fullLogic]
+      fullLogic = [[-1, fullLogic]]
+    fullLogic = fullLogic
     #print "FullLogic:", fullLogic
-    return self.scoreRangeForFullLogic(indexRange, fullLogic)
+    return self.scoreRangeForFullLogic(depth, indexRange, fullLogic)
 
   ##############################################################################
   # Score Range For Full Logic
-  def scoreRangeForFullLogic(self, indexRange, logic):
-    print "ScoreRangeForFullLogic", indexRange, logic
+  ##############################################################################
+  def scoreRangeForFullLogic(self, depth, indexRange, logic):
+    #print "ScoreRangeForFullLogic", indexRange, logic
+    #print "%sSRFFL R=%s L=%s" % (self.tab[depth], indexRange, logic)
     score = 0
     for problemIndices in self.problemIndicesForRange(indexRange):
       problemSerial = self.indicesToSerial(0, problemIndices)
@@ -659,20 +722,25 @@ def scoreRangeForFullLogic(self, indexRange, logic):
       gflops = self.data[problemSerial + solutionIdx]
       timeUs = totalFlops / gflops / 1000
       score += timeUs
+      #print "%sSRFFL t+=%.0f" % (self.tab[depth], timeUs)
     logicComplexity = [0]*self.numIndices
     self.scoreLogicComplexity(logic, logicComplexity)
+    #print "%sSRFFL Complexity=%s" % (self.tab[depth], logicComplexity)
     score += self.parameters["BranchWeight"] * sum(logicComplexity)
-    print "LogicComplexity:", logicComplexity
+    #print "LogicComplexity:", logicComplexity
     return score
 
   ##############################################################################
   # Get Solution For Problem Indices Using Logic
+  ##############################################################################
   def getSolutionForProblemIndicesUsingLogic(self, problemIndices, logic):
-    currentProblemIndices = problemIndices
+    #print "i:", problemIndices
+    currentProblemIndices = self.toIndexOrder(problemIndices)
+    #print "i:", currentProblemIndices
     currentLogic = logic
     for i in range(0, self.numIndices):
-      #print "CurrentLogic[%u]: %s" % (i, currentLogic)
       currentSizeIndex = currentProblemIndices[0]
+      #print "CurrentLogic[%u] P[%2u]: %s" % (i, currentSizeIndex, currentLogic)
       for j in range(0, len(currentLogic)):
         if currentLogic[j][0] < 0:
           currentProblemIndices = currentProblemIndices[1:]
@@ -683,16 +751,67 @@ def getSolutionForProblemIndicesUsingLogic(self, problemIndices, logic):
             currentProblemIndices = currentProblemIndices[1:]
             currentLogic = currentLogic[j][1]
             break
-    #print "CurrentLogic[%u]: %s" % (i, currentLogic)
+    #print "FinalLogic[%u]: %s" % (i, currentLogic)
     return currentLogic
 
+
+  ##############################################################################
+  ##############################################################################
+  ###
+  ###  Helper / Low-Level Functions
+  ###
+  ##############################################################################
+  ##############################################################################
+
+
+  ##############################################################################
+  # Get Winner For Problem
+  def getWinnerForProblem(self, problemIndices):
+    problemSerial = self.indicesToSerial(0, problemIndices)
+    winnerIdx = -1
+    winnerGFlops = -1
+    for solutionIdx in range(0, self.numSolutions):
+      solutionSerialIdx = problemSerial + solutionIdx
+      solutionGFlops = self.data[solutionSerialIdx]
+      if solutionGFlops > winnerGFlops:
+        #print "%f > %f" % (solutionGFlops, winnerGFlops)
+        winnerIdx = solutionIdx
+        winnerGFlops = solutionGFlops
+    return (winnerIdx, winnerGFlops)
+
+
+  ##############################################################################
+  # Winner For Range
+  def winnerForRange(self, indexRange):
+    scores = self.scoreRangeForSolutions(indexRange)
+    winnerIdx = 0
+    for solutionIdx in range(1, self.numSolutions):
+      if scores[solutionIdx] < scores[winnerIdx]:
+        winnerIdx = solutionIdx
+    return winnerIdx
+
+
+  ##############################################################################
+  # Score (microseconds) Range For Solutions
+  def scoreRangeForSolutions(self, indexRange):
+    scores = [0]*self.numSolutions
+    for problemIndices in self.problemIndicesForRange(indexRange):
+      problemSerial = self.indicesToSerial(0, problemIndices)
+      totalFlops = self.totalFlopsForProblemIndices(problemIndices)
+      for solutionIdx in range(0, self.numSolutions):
+        gflops = self.data[problemSerial+solutionIdx]
+        timeUs = totalFlops / gflops / 1000
+        scores[solutionIdx] += timeUs
+    return scores
+
+
   ##############################################################################
   # Score Logic Complexity
   def scoreLogicComplexity(self, logic, logicComplexity):
-    print "ScoreLogicComplexity: %s" % (logic)
     depth = self.getLogicDepth(logic)
-    depth = self.numIndices - depth
     if depth == 0: return
+    depth = self.numIndices - depth
+    #print "ScoreLogicComplexity[%u]: %s" % (depth, logic)
     #print "[%u]ScoreLogicComplexity: %s" % (depth, logic)
     currentLogic = logic
     for i in range(0, len(logic)):
@@ -710,6 +829,21 @@ def getLogicDepth(self, logic):
       depth += 1
     return depth
 
+  ##############################################################################
+  # To Index Order
+  def toIndexOrder(self, problemIndices):
+    ordered = []
+    for i in self.indexOrder:
+      ordered.append(problemIndices[i])
+    return ordered
+# serial order = 0, 1, 2, 3
+# problem indi = 9, 8, 7, 6
+
+# index  order = 3, 2, 0, 1
+# ordered      = 6, 7, 9, 8
+#
+#
+
   ##############################################################################
   # Total Flops For Problem Indices
   def totalFlopsForProblemIndices(self, problemIndices):
@@ -718,41 +852,6 @@ def totalFlopsForProblemIndices(self, problemIndices):
       totalFlops *= self.problemIndexToSize[i][problemIndices[i]]
     return totalFlops
 
-  ##############################################################################
-  # Remove Solution
-  def removeSolution(self, removeSolutionIdx):
-
-    # temporarily move current to old
-    oldSolutions = self.solutions
-    oldNumSolutions = self.numSolutions
-    oldData = self.data
-    oldTotalSize = self.totalSize
-
-    # update solutions
-    self.solutions = []
-    for i in range(0, oldNumSolutions):
-      if i != removeSolutionIdx:
-        self.solutions.append(oldSolutions[i])
-    self.solutionMinNaming = Solution.getMinNaming(self.solutions)
-    self.solutionNames = []
-    self.solutionTiles = []
-    for solution in self.solutions:
-      self.solutionNames.append(Solution.getNameMin(solution, \
-          self.solutionMinNaming))
-      self.solutionTiles.append("%ux%u"%(solution["MacroTile0"], \
-          solution["MacroTile1"]))
-    self.numSolutions = len(self.solutions)
-
-    # update data
-    self.totalSize = self.totalProblems * self.numSolutions
-    self.data = array.array('f', [0]*self.totalSize)
-    for problemIndex in range(0, self.totalProblems):
-      newSolutionIdx = 0
-      for oldSolutionIdx in range(0, oldNumSolutions):
-        if oldSolutionIdx != removeSolutionIdx:
-          self.data[problemIndex*self.numSolutions+newSolutionIdx] \
-              = oldData[problemIndex*oldNumSolutions+oldSolutionIdx]
-          newSolutionIdx += 1
 
   ##############################################################################
   # Recommended Index Order
@@ -767,28 +866,6 @@ def recommendedIndexOrder(self):
     order.append(self.idx1)
     return order
 
-  ##############################################################################
-  # Print Data
-  def printData(self):
-    print2("serial; idxD0, idxD1, idxDU, idxOthers; sizeD0, sizeD1, sizeDU, sizeOthers; sol0, sol1, sol2, ...")
-    indices = [0]*self.numIndices
-    for serial in range(0, self.totalProblems):
-      s = "[%4u] [%2u" % (serial, indices[0])
-      for i in range(1, self.numIndices):
-        s += ", %2u" % indices[i]
-      s += "] [%4u" % self.problemIndexToSize[0][indices[0]]
-      for i in range(1, self.numIndices):
-        s += ", %4u" % self.problemIndexToSize[i][indices[i]]
-      s += "]: %9.3f" % self.data[serial*self.numSolutions+0]
-      for i in range(1, self.numSolutions):
-        s += ", %9.3f" % self.data[serial*self.numSolutions+i]
-      print2(s)
-      indices[0] += 1
-      for i in range(1, self.numIndices):
-        if indices[i-1] >= self.numProblemSizes[i-1]:
-          indices[i-1] = 0
-          indices[i] += 1
-
   ##############################################################################
   # Problem Indices For Range
   def problemIndicesForRange(self, indexRange):
@@ -814,7 +891,6 @@ def problemIndicesForRange(self, indexRange):
     return problemIndexList
 
 
-
   ##############################################################################
   # Get Size Free
   def getSizeFree(self, problemIndices):
@@ -929,3 +1005,7 @@ def main(  config ):
         schedulePrefix, logic)
 
   popWorkingPath()
+
+########################################
+# TODO
+# - is scoring working
diff --git a/Tensile/SolutionStructs.py b/Tensile/SolutionStructs.py
index 795c32022..e3d66be05 100644
--- a/Tensile/SolutionStructs.py
+++ b/Tensile/SolutionStructs.py
@@ -124,7 +124,7 @@ def numRegisters( self ):
   def numBytes( self ):
     return self.numRegisters() * 4
   def flopsPerMac(self):
-    return 1 if self.isReal() else 8
+    return 2 if self.isReal() else 8
 
   def __str__(self):
     return self.toChar()

From 6d68b297709b7a414e903646702f9aab9cfd142b Mon Sep 17 00:00:00 2001
From: David Tanner <guacamoleo@gmail.com>
Date: Thu, 2 Mar 2017 11:58:38 -0600
Subject: [PATCH 15/21] increasing branch penalty does decrease number of
 branches and worsen score

---
 Tensile/LibraryLogic.py | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/Tensile/LibraryLogic.py b/Tensile/LibraryLogic.py
index ae5b810e9..0ab30ab9b 100644
--- a/Tensile/LibraryLogic.py
+++ b/Tensile/LibraryLogic.py
@@ -451,10 +451,22 @@ def enRule(self, currentIndexIndex, currentIndexRange):
         else:
           print "%sScoring P:%s for Prior=%s, Cand=%s" \
               % ( tab, nextIndexRange, priorRuleForSize, candidateRule)
+          # score prior
           priorRuleScore = self.scoreRangeForLogic(nextIndexRange, \
               [priorRuleForSize])
+          logicComplexity = [0]*self.numIndices
+          self.scoreLogicComplexity( \
+              [priorRuleForSize], logicComplexity)
+          priorRuleScore += self.parameters["BranchWeight"] \
+              * sum(logicComplexity)
+          # score candidate
           candidateRuleScore = self.scoreRangeForLogic(nextIndexRange, \
               [candidateRule])
+          logicComplexity = [0]*self.numIndices
+          self.scoreLogicComplexity( \
+              [candidateRule], logicComplexity)
+          candidateRuleScore += self.parameters["BranchWeight"] \
+              * sum(logicComplexity)
           candidateRuleScore += self.parameters["BranchWeight"] # penalize
           candidateFaster = candidateRuleScore < priorRuleScore
           print "%sP[%2u]: %s %s~%.0fus < %s~%.0fus" % (tab, problemIndex, \
@@ -723,10 +735,10 @@ def scoreRangeForFullLogic(self, depth, indexRange, logic):
       timeUs = totalFlops / gflops / 1000
       score += timeUs
       #print "%sSRFFL t+=%.0f" % (self.tab[depth], timeUs)
-    logicComplexity = [0]*self.numIndices
-    self.scoreLogicComplexity(logic, logicComplexity)
+    #logicComplexity = [0]*self.numIndices
+    #self.scoreLogicComplexity(logic, logicComplexity)
     #print "%sSRFFL Complexity=%s" % (self.tab[depth], logicComplexity)
-    score += self.parameters["BranchWeight"] * sum(logicComplexity)
+    #score += self.parameters["BranchWeight"] * sum(logicComplexity)
     #print "LogicComplexity:", logicComplexity
     return score
 

From af066ce9a1bf749edf2d6c1c5e0e63fd47730868 Mon Sep 17 00:00:00 2001
From: David Tanner <guacamoleo@gmail.com>
Date: Thu, 2 Mar 2017 16:21:13 -0600
Subject: [PATCH 16/21] writing new library code for new solution selection
 logic

---
 Tensile/ClientWriter.py         |   9 ++-
 Tensile/LibraryLogic.py         | 128 ++++++++++++--------------------
 Tensile/TensileCreateLibrary.py |  79 ++++++++++++++++----
 Tensile/YAMLIO.py               |  56 ++++++++++----
 4 files changed, 155 insertions(+), 117 deletions(-)

diff --git a/Tensile/ClientWriter.py b/Tensile/ClientWriter.py
index f3289dcc8..2ee9ba083 100644
--- a/Tensile/ClientWriter.py
+++ b/Tensile/ClientWriter.py
@@ -1,4 +1,4 @@
-from Common import globalParameters, HR, pushWorkingPath, popWorkingPath, print1, CHeader, printExit
+from Common import globalParameters, HR, pushWorkingPath, popWorkingPath, print1, CHeader, printExit, printWarning
 from SolutionStructs import Solution
 from SolutionWriter import SolutionWriter
 import YAMLIO
@@ -54,13 +54,14 @@ def main( config ):
   ##############################################################################
   logicFiles = [os.path.join(libraryLogicPath, f) for f \
       in os.listdir(libraryLogicPath) \
-      if os.path.isfile(os.path.join(libraryLogicPath, f))]
+      if (os.path.isfile(os.path.join(libraryLogicPath, f)) \
+      and os.path.splitext(f)[1]==".yaml")]
   print1("LogicFiles: %s" % logicFiles)
   functions = []
   functionNames = []
   for logicFileName in logicFiles:
-    (scheduleName, problemType, solutionsForType, skinnyLogic0, skinnyLogic1, \
-        diagonalLogic) = YAMLIO.readLibraryLogicForProblemType(logicFileName)
+    (scheduleName, problemType, solutionsForType, indexOrder, logic) \
+        = YAMLIO.readLibraryLogicForProblemType(logicFileName)
     functions.append((scheduleName, problemType))
     functionNames.append("tensile_%s_%s" % (scheduleName, problemType))
 
diff --git a/Tensile/LibraryLogic.py b/Tensile/LibraryLogic.py
index 0ab30ab9b..3016f1c79 100644
--- a/Tensile/LibraryLogic.py
+++ b/Tensile/LibraryLogic.py
@@ -2,6 +2,7 @@
 import os.path
 import array
 import csv
+from sys import stdout
 
 from copy import deepcopy
 
@@ -20,9 +21,6 @@ def analyzeProblemType( problemTypeTuple, inputParameters ):
   print2(HR)
   print1("# %s" % problemType)
 
-  #print "#  %s" % dataFileName
-  #print "#  %s" % solutionsFileName
-
   ######################################
   # Read Solutions
   (problemSizes, solutions) = YAMLIO.readSolutions(solutionsFileName)
@@ -58,20 +56,18 @@ def analyzeProblemType( problemTypeTuple, inputParameters ):
   ######################################
   # Create Rules
   logic = logicAnalyzer.enRule(0, logicAnalyzer.globalIndexRange)
-  print "Final Logic:"
-  print logic
+  print2("# Final Logic:")
+  print2(logic)
   logicComplexity = [0]*logicAnalyzer.numIndices
   logicAnalyzer.scoreLogicComplexity(logic, logicComplexity)
-  print "Logic Complexity:", logicComplexity
+  print2("Logic Complexity: %s" % logicComplexity)
   score = logicAnalyzer.scoreRangeForLogic( \
       logicAnalyzer.globalIndexRange, logic)
-  print "Global Score:", score
-
-
+  print1("\nScore: %.0f ms" % (score/1000))
 
   #return (skinnyRules01, skinnyRules10, diagonalRules)
   #return (problemType, logicAnalyzer.solutionsUsed, [], [], logicAnalyzer.diagonalRules )
-  return (problemType, [], [], [], [] )
+  return (problemType, logicAnalyzer.solutions, logicAnalyzer.indexOrder, logic)
 
 
 
@@ -80,20 +76,6 @@ def analyzeProblemType( problemTypeTuple, inputParameters ):
 ################################################################################
 class LogicAnalyzer:
 
-  ########################################
-  # diagonal rule looks like
-  # 0: solutionIdx
-  # 1: problemIndices for minThreshold problem
-  # 2: gflops at above minSize
-  # 3: maxGFlops for this solution along diagonal in interval it won
-  # 4: gflops of prior winner at minSize, i.e., what performance did it beat
-
-  ########################################
-  # skinny rule looks like
-  # 0: solutionIdx
-  # 1: problemIndices for minThreshold problem
-  # 2: gflops at above minSize
-
   ##############################################################################
   ##############################################################################
   ###
@@ -173,8 +155,6 @@ def __init__(self, problemType, problemSizes, solutions, inputParameters):
     self.w0 = self.parameters["Weight0"]
     self.w1 = self.parameters["Weight1"]
     self.w2 = self.parameters["Weight2"]
-    #print "S->I %s" % self.problemSizeToIndex
-    #print "I->S %s" % self.problemIndexToSize
     self.indexOrder = self.recommendedIndexOrder()
     print2("IndexOrder: %s" % self.indexOrder)
     self.globalIndexRange = []
@@ -224,10 +204,6 @@ def populateFromCSV(self, dataFileName):
         for i in range(0, self.numIndices):
           problemIndices.append(self.problemSizeToIndex[i][problemSize[i]])
         serialIdx = self.indicesToSerial(0, problemIndices)
-        #print "%s -> %s -> %u" % (problemSize, problemIndices, serialIdx)
-
-        # total size
-        #totalFlops = float(row[totalSizeIdx])
 
         # data
         solutionIdx = 0
@@ -316,7 +292,6 @@ def smooth(self):
           new = sum(neighborGFlops)/len(neighborGFlops)
           old = self.data[problemSerial+solutionIdx]
           s += "%f -> %f" % (old, new)
-          print s
           self.data[problemSerial+solutionIdx] \
               = sum(neighborGFlops)/len(neighborGFlops)
 
@@ -346,28 +321,30 @@ def smooth(self):
   def enRule(self, currentIndexIndex, currentIndexRange):
     cii = currentIndexIndex
     if currentIndexIndex == 0:
-      self.tab[cii] = "| "
+      self.tab[cii] = "[] "
     elif currentIndexIndex == 1:
-      self.tab[cii] = "[%2u]-| " % ( \
+      self.tab[cii] = "[%2u] " % ( \
           currentIndexRange[self.indexOrder[0]][0])
     elif currentIndexIndex == 2:
-      self.tab[cii] = "[%2u,%2u]--| " % ( \
+      self.tab[cii] = "[%2u,%2u] " % ( \
           currentIndexRange[self.indexOrder[0]][0], \
           currentIndexRange[self.indexOrder[1]][0])
     elif currentIndexIndex == 3:
-      self.tab[cii] = "[%2u,%2u,%2u]---| " % ( \
+      self.tab[cii] = "[%2u,%2u,%2u] " % ( \
           currentIndexRange[self.indexOrder[0]][0], \
           currentIndexRange[self.indexOrder[1]][0], \
           currentIndexRange[self.indexOrder[2]][0])
     elif currentIndexIndex == 4:
-      self.tab[cii] = "[%2u,%2u,%2u,%2u]---| " % ( \
+      self.tab[cii] = "[%2u,%2u,%2u,%2u] " % ( \
           currentIndexRange[self.indexOrder[0]][0], \
           currentIndexRange[self.indexOrder[1]][0], \
           currentIndexRange[self.indexOrder[2]][0], \
           currentIndexRange[self.indexOrder[3]][0])
     tab = self.tab[cii]
+    if globalParameters["PrintLevel"] == 1:
+      stdout.write("\n%s"%tab)
     currentIndex = self.indexOrder[currentIndexIndex]
-    print "%senRule(%s)" % (tab, currentIndexRange)
+    print2("%senRule(%s)" % (tab, currentIndexRange))
     nextIndexIndex = currentIndexIndex+1
     nextIndexRange = deepcopy(currentIndexRange)
     isLastIndex = currentIndexIndex == self.numIndices-1
@@ -389,17 +366,21 @@ def enRule(self, currentIndexIndex, currentIndexRange):
         #  if scores[solutionIdx] < scores[winnerIdx]:
         #    winnerIdx = solutionIdx
         winnerIdx = self.winnerForRange(currentIndexRange)
-        print "%sreturning early winner=%u" % (tab, winnerIdx)
+        #print2("%sreturning early winner=%u" % (tab, winnerIdx))
         ruleList.append(-1)
         ruleList.append(winnerIdx)
+        if globalParameters["PrintLevel"] == 1:
+          stdout.write("#")
 
       ########################################
       # this isn't last index, so just recursively return next index
       else:
-        print "%sreturning early enRule(%s)" \
-            % (tab, nextIndexRange)
+        #print2("%sreturning early enRule(%s)" \
+        #    % (tab, nextIndexRange) )
         rule = [ -1, self.enRule(nextIndexIndex, nextIndexRange) ]
         ruleList.append(rule)
+        if globalParameters["PrintLevel"] == 1:
+          stdout.write("#")
 
     ########################################
     # full iterative rule list
@@ -416,11 +397,13 @@ def enRule(self, currentIndexIndex, currentIndexRange):
         winnerIdx = self.winnerForRange(nextIndexRange)
         initialRule = [ currentIndexRange[currentIndex][0], winnerIdx]
       else:
-        print "%sinitialRule(%s)" % (tab, nextIndexRange)
+        #print2("%sinitialRule(%s)" % (tab, nextIndexRange))
         initialRule = [ currentIndexRange[currentIndex][0], \
             self.enRule(nextIndexIndex, nextIndexRange) ]
-        print "%sinitialRule(%s) DONE" % (tab, nextIndexRange)
+        #print2("%sinitialRule(%s) DONE" % (tab, nextIndexRange))
       ruleList.append(initialRule)
+      if globalParameters["PrintLevel"] == 1:
+        stdout.write("#")
 
       ########################################
       # for all problem indices in this index
@@ -442,15 +425,17 @@ def enRule(self, currentIndexIndex, currentIndexRange):
         ########################################
         # candidate same as prior
         if candidateRule[1] == priorRule[1]:
-          print "%sP[%2u]: same" % (tab, problemIndex)
+          #print2("%sP[%2u]: same" % (tab, problemIndex))
           ruleList[len(ruleList)-1][0] = problemIndex
+          if globalParameters["PrintLevel"] == 1:
+            stdout.write(" ")
           continue
 
         ########################################
         # compare candidate vs prior
         else:
-          print "%sScoring P:%s for Prior=%s, Cand=%s" \
-              % ( tab, nextIndexRange, priorRuleForSize, candidateRule)
+          #print2("%sScoring P:%s for Prior=%s, Cand=%s" \
+          #    % ( tab, nextIndexRange, priorRuleForSize, candidateRule))
           # score prior
           priorRuleScore = self.scoreRangeForLogic(nextIndexRange, \
               [priorRuleForSize])
@@ -469,22 +454,26 @@ def enRule(self, currentIndexIndex, currentIndexRange):
               * sum(logicComplexity)
           candidateRuleScore += self.parameters["BranchWeight"] # penalize
           candidateFaster = candidateRuleScore < priorRuleScore
-          print "%sP[%2u]: %s %s~%.0fus < %s~%.0fus" % (tab, problemIndex, \
+          print2("%sP[%2u]: %s %s~%.0fus < %s~%.0fus" % (tab, problemIndex, \
               "wins" if candidateFaster else "same", \
               candidateRule, candidateRuleScore, priorRuleForSize, \
-              priorRuleScore )
+              priorRuleScore ))
 
           ########################################
           # candidate wins
           if candidateRuleScore < priorRuleScore:
             ruleList.append(candidateRule)
+            if globalParameters["PrintLevel"] == 1:
+              stdout.write("#")
 
           ########################################
           # prior wins
           else:
+            if globalParameters["PrintLevel"] == 1:
+              stdout.write(".")
             ruleList[len(ruleList)-1][0] = problemIndex
 
-    print "%sReturning RuleList: %s" % (tab, ruleList)
+    #print2("%sReturning RuleList: %s" % (tab, ruleList))
     return ruleList
 
 
@@ -562,7 +551,6 @@ def print2D(self, indices ):
           solutionSerialIdx = problemSerial + solutionIdx
           solutionGFlops = self.data[solutionSerialIdx]
           if solutionGFlops > winnerGFlops:
-            #print "%f > %f" % (solutionGFlops, winnerGFlops)
             secondIdx = winnerIdx
             secondGFlops = winnerGFlops
             winnerIdx = solutionIdx
@@ -702,29 +690,18 @@ def removeSolution(self, removeSolutionIdx):
   # Score Range For Logic
   ##############################################################################
   def scoreRangeForLogic(self, indexRange, logic):
-    #print "ScoreRangeForLogic", indexRange, logic
     depth = self.getLogicDepth(logic)
     depth = self.numIndices - depth
-    #print "%sSRFL R=%s L=%s" % (self.tab[depth], indexRange, logic)
-    #obj = logic
-    #while isinstance(obj[0], list):
-    #  obj = obj[0][1]
-    #  depth -= 1
-    #print "Depth:", depth
     fullLogic = deepcopy(logic)
     for i in range(0, depth):
-      #print "Logic:", fullLogic
       fullLogic = [[-1, fullLogic]]
     fullLogic = fullLogic
-    #print "FullLogic:", fullLogic
     return self.scoreRangeForFullLogic(depth, indexRange, fullLogic)
 
   ##############################################################################
   # Score Range For Full Logic
   ##############################################################################
   def scoreRangeForFullLogic(self, depth, indexRange, logic):
-    #print "ScoreRangeForFullLogic", indexRange, logic
-    #print "%sSRFFL R=%s L=%s" % (self.tab[depth], indexRange, logic)
     score = 0
     for problemIndices in self.problemIndicesForRange(indexRange):
       problemSerial = self.indicesToSerial(0, problemIndices)
@@ -734,25 +711,16 @@ def scoreRangeForFullLogic(self, depth, indexRange, logic):
       gflops = self.data[problemSerial + solutionIdx]
       timeUs = totalFlops / gflops / 1000
       score += timeUs
-      #print "%sSRFFL t+=%.0f" % (self.tab[depth], timeUs)
-    #logicComplexity = [0]*self.numIndices
-    #self.scoreLogicComplexity(logic, logicComplexity)
-    #print "%sSRFFL Complexity=%s" % (self.tab[depth], logicComplexity)
-    #score += self.parameters["BranchWeight"] * sum(logicComplexity)
-    #print "LogicComplexity:", logicComplexity
     return score
 
   ##############################################################################
   # Get Solution For Problem Indices Using Logic
   ##############################################################################
   def getSolutionForProblemIndicesUsingLogic(self, problemIndices, logic):
-    #print "i:", problemIndices
     currentProblemIndices = self.toIndexOrder(problemIndices)
-    #print "i:", currentProblemIndices
     currentLogic = logic
     for i in range(0, self.numIndices):
       currentSizeIndex = currentProblemIndices[0]
-      #print "CurrentLogic[%u] P[%2u]: %s" % (i, currentSizeIndex, currentLogic)
       for j in range(0, len(currentLogic)):
         if currentLogic[j][0] < 0:
           currentProblemIndices = currentProblemIndices[1:]
@@ -763,7 +731,6 @@ def getSolutionForProblemIndicesUsingLogic(self, problemIndices, logic):
             currentProblemIndices = currentProblemIndices[1:]
             currentLogic = currentLogic[j][1]
             break
-    #print "FinalLogic[%u]: %s" % (i, currentLogic)
     return currentLogic
 
 
@@ -786,7 +753,6 @@ def getWinnerForProblem(self, problemIndices):
       solutionSerialIdx = problemSerial + solutionIdx
       solutionGFlops = self.data[solutionSerialIdx]
       if solutionGFlops > winnerGFlops:
-        #print "%f > %f" % (solutionGFlops, winnerGFlops)
         winnerIdx = solutionIdx
         winnerGFlops = solutionGFlops
     return (winnerIdx, winnerGFlops)
@@ -823,8 +789,6 @@ def scoreLogicComplexity(self, logic, logicComplexity):
     depth = self.getLogicDepth(logic)
     if depth == 0: return
     depth = self.numIndices - depth
-    #print "ScoreLogicComplexity[%u]: %s" % (depth, logic)
-    #print "[%u]ScoreLogicComplexity: %s" % (depth, logic)
     currentLogic = logic
     for i in range(0, len(logic)):
       logicComplexity[depth] += 1
@@ -841,6 +805,7 @@ def getLogicDepth(self, logic):
       depth += 1
     return depth
 
+
   ##############################################################################
   # To Index Order
   def toIndexOrder(self, problemIndices):
@@ -848,13 +813,7 @@ def toIndexOrder(self, problemIndices):
     for i in self.indexOrder:
       ordered.append(problemIndices[i])
     return ordered
-# serial order = 0, 1, 2, 3
-# problem indi = 9, 8, 7, 6
 
-# index  order = 3, 2, 0, 1
-# ordered      = 6, 7, 9, 8
-#
-#
 
   ##############################################################################
   # Total Flops For Problem Indices
@@ -1012,12 +971,17 @@ def main(  config ):
   # Run Analysis
   schedulePrefix = globalParameters["Name"]
   for problemTypeTuple in problemTypeTuples:
-    logic = analyzeProblemType( problemTypeTuple, analysisParameters )
+    logicTuple = analyzeProblemType( problemTypeTuple, analysisParameters )
     YAMLIO.writeLibraryLogicForProblemType(globalParameters["WorkingPath"], \
-        schedulePrefix, logic)
+        schedulePrefix, logicTuple)
 
   popWorkingPath()
 
 ########################################
 # TODO
-# - is scoring working
+# - different weights for different levels?
+#   are there pairs of weights that would result in same logic complexity but better score?
+
+########################################
+# TODO problems with this algorithm
+# - barrier to switching may not always be amortised on next step, need to calculate several steps into future to see if net win; process needs to be a search tree. 32x32 search only takes 1 second
diff --git a/Tensile/TensileCreateLibrary.py b/Tensile/TensileCreateLibrary.py
index ef7c13392..a17366852 100644
--- a/Tensile/TensileCreateLibrary.py
+++ b/Tensile/TensileCreateLibrary.py
@@ -171,9 +171,8 @@ def writeLogic(outputPath, logicList, solutionWriter ):
     scheduleName = logicProblemType[0]
     problemType = logicProblemType[1]
     solutions = logicProblemType[2]
-    skinnyLogic0 = logicProblemType[3]
-    skinnyLogic1 = logicProblemType[4]
-    diagonalLogic = logicProblemType[5]
+    indexOrder = logicProblemType[3]
+    logic = logicProblemType[4]
 
     # solution names
     solutionNames = []
@@ -218,14 +217,22 @@ def writeLogic(outputPath, logicList, solutionWriter ):
     for i in range(1, len(problemType["IndicesSummation"])):
       s += "*size%s" % indexChars[problemType["IndicesSummation"][i]]
     s += ";\n\n"
-    for rule in skinnyLogic0:
-      print2(rule)
-    for rule in skinnyLogic1:
-      print2(rule)
+    #for rule in skinnyLogic0:
+    #  print2(rule)
+    #for rule in skinnyLogic1:
+    #  print2(rule)
     print2(solutionNames)
 
-    for ruleIdx in range(0, len(diagonalLogic)):
-      rule = diagonalLogic[ruleIdx]
+    logicStr = writeLogicRec(0, indexOrder, logic, solutionNames)
+    print logicStr
+    printExit("TODO")
+    #for indexIndex in range(0, problemType["TotalIndices"]):
+    #  index = indexOrder[indexIndex]
+    #  for ruleIdx in range(0, len(logic)):
+    #    rule = logic[ruleIdx]
+
+
+    """
       print2(rule)
       winnerIdx = rule[0]
       problemSize = rule[1]
@@ -267,7 +274,7 @@ def writeLogic(outputPath, logicList, solutionWriter ):
         s += ", size%s" % indexChars[i]
       s += ", stream, numInputEvents, inputEvents, outputEvent ); /* [%f,%f] GFlops*/\n" % (minGFlops,maxGFlops)
 
-
+    """
     s += "\n}\n"
 
     # open and close individual files
@@ -289,6 +296,45 @@ def writeLogic(outputPath, logicList, solutionWriter ):
   logicHeaderFile.write(h)
   logicHeaderFile.close()
 
+################################################################################
+# Write Logic Recursive
+################################################################################
+def writeLogicRec(depth, indexOrder, logic, solutionNames):
+  indexChars = globalParameters["IndexChars"]
+  indent = "  "
+  indent += "  "*depth
+  s = ""
+  lowestLevel = depth == len(indexOrder)-1
+  numRules = len(logic)
+  if numRules > 1:
+    # multiple rules, need if/else
+    for ruleIdx in range(0, numRules):
+      rule = logic[ruleIdx]
+      threshold = rule[0]
+      if lowestLevel:
+        solutionIdx = rule[1]
+        s += "%sif (size%s < %u) return solution[%u];\n" \
+            % (indent, indexChars[indexOrder[depth]], threshold, solutionIdx)
+      else:
+        s += "%sif (size%s < %u) {\n" \
+            % (indent, indexChars[indexOrder[depth]], threshold)
+        s += writeLogicRec(depth+1, indexOrder, rule[1], solutionNames)
+        s += "%s}\n" % (indent)
+  else:
+    ruleIdx = 0
+    rule = logic[ruleIdx]
+    threshold = rule[0]
+    if lowestLevel:
+      solutionIdx = rule[1]
+      s += "%sreturn solution[%u];\n" \
+          % (indent, solutionIdx)
+    else:
+      s += "%s{\n" \
+          % (indent)
+      s += writeLogicRec(depth+1, indexOrder, rule[1], solutionNames)
+      s += "%s}\n" % (indent)
+  return s
+
 
 ################################################################################
 # Write CMake
@@ -412,11 +458,12 @@ def TensileCreateLibrary():
     printExit("LogicPath %s doesn't exist" % logicPath)
 
   logicFiles = [os.path.join(logicPath, f) for f in os.listdir(logicPath) \
-      if os.path.isfile(os.path.join(logicPath, f))]
+      if (os.path.isfile(os.path.join(logicPath, f)) \
+      and os.path.splitext(f)[1]==".yaml")]
 
-  print2("# LibraryLogicFiles:" % logicFiles)
+  print1("# LibraryLogicFiles:")
   for logicFile in logicFiles:
-    print2("#   %s" % logicFile)
+    print1("#   %s" % logicFile)
 
   ##############################################################################
   # Parse config files
@@ -424,10 +471,10 @@ def TensileCreateLibrary():
   solutions = []
   logicList = []
   for logicFileName in logicFiles:
-    (scheduleName, problemType, solutionsForType, skinnyLogic0, skinnyLogic1, \
-        diagonalLogic) = YAMLIO.readLibraryLogicForProblemType(logicFileName)
+    (scheduleName, problemType, solutionsForType, indexOrder, logic) \
+        = YAMLIO.readLibraryLogicForProblemType(logicFileName)
     logicList.append((scheduleName, problemType, solutionsForType, \
-        skinnyLogic0, skinnyLogic1, diagonalLogic))
+        indexOrder, logic ))
     for solution in solutionsForType:
       if solution not in solutions:
         solutions.append(solution)
diff --git a/Tensile/YAMLIO.py b/Tensile/YAMLIO.py
index c0f7393b3..a806bd54e 100644
--- a/Tensile/YAMLIO.py
+++ b/Tensile/YAMLIO.py
@@ -85,27 +85,52 @@ def readSolutions( filename ):
 # 1 yaml per problem type
 # problemType, skinny0, skinny1, diagonal
 ################################################################################
-def writeLibraryLogicForProblemType( filePath, schedulePrefix, logic):
-  problemType   = logic[0]
-  solutions     = logic[1]
-  skinnyLogic0  = logic[2]
-  skinnyLogic1  = logic[3]
-  diagonalLogic = logic[4]
+def writeLibraryLogicForProblemType( filePath, schedulePrefix, logicTuple):
+  problemType   = logicTuple[0]
+  solutions     = logicTuple[1]
+  indexOrder    = logicTuple[2]
+  logic         = logicTuple[3]
   filename = os.path.join(filePath, "%s_%s.yaml" \
       % (schedulePrefix, str(problemType)))
   print2("# writeLogic( %s )" % ( filename ))
 
-  # open file
+  data = []
+  # logic name
+  data.append(globalParameters["Name"])
+  # problem type
+  problemTypeState = problemType.state
+  problemTypeState["DataType"] = \
+      problemTypeState["DataType"].value
+  data.append(problemTypeState)
+  # solutions
+  solutionList = []
+  for solution in solutions:
+    solutionState = solution.state
+    solutionState["ProblemType"] = solutionState["ProblemType"].state
+    solutionState["ProblemType"]["DataType"] = \
+        solutionState["ProblemType"]["DataType"].value
+    solutionList.append(solutionState)
+  data.append(solutionList)
+  # index order
+  data.append(indexOrder)
+  # logic
+  data.append(logic)
+
+  # open & write file
   try:
     stream = open(filename, "w")
+    #yaml.dump(data, stream, default_flow_style=False)
+    yaml.dump(data, stream)
+    stream.close()
   except IOError:
     printExit("Cannot open file: %s" % filename)
 
+  """
+  #data = [ globalParameters["Name"], problemTypeState, [], [], [] ]
   # write problem type
   problemTypeState = problemType.state
   problemTypeState["DataType"] = \
       problemTypeState["DataType"].value
-  data = [ globalParameters["Name"], problemTypeState, [], [], [], [] ]
   for solution in solutions:
     solutionState = solution.state
     solutionState["ProblemType"] = solutionState["ProblemType"].state
@@ -122,6 +147,7 @@ def writeLibraryLogicForProblemType( filePath, schedulePrefix, logic):
   #stream.write(data)
   yaml.dump(data, stream, default_flow_style=False)
   stream.close()
+  """
 
 
 def readLibraryLogicForProblemType( filename ):
@@ -134,19 +160,20 @@ def readLibraryLogicForProblemType( filename ):
   stream.close()
 
   # verify
-  if len(data) < 6:
-    printExit("len(%s) %u < 6" % (filename, len(data)))
+  if len(data) < 5:
+    printExit("len(%s) %u < 5" % (filename, len(data)))
 
   # parse out objects
   scheduleName = data[0]
   problemTypeState = data[1]
   solutionStates = data[2]
-  skinnyLogic0 = data[3]
-  skinnyLogic1 = data[4]
-  diagonalLogic = data[5]
+  indexOrder = data[3]
+  logic = data[4]
 
-  solutions = []
+  # unpack problemType
   problemType = ProblemType(problemTypeState)
+  # unpack solutions
+  solutions = []
   for i in range(0, len(solutionStates)):
     solutionState = solutionStates[i]
     solutionObject = Solution(solutionState)
@@ -155,5 +182,4 @@ def readLibraryLogicForProblemType( filename ):
           % (problemType, solutionObject["ProblemType"]))
     solutions.append(solutionObject)
 
-  return (scheduleName, problemType, solutions, skinnyLogic0, skinnyLogic1, \
-      diagonalLogic)
+  return (scheduleName, problemType, solutions, indexOrder, logic )

From c73e3ac537099f4c58d5e2e90e36bf1bf0adbe1c Mon Sep 17 00:00:00 2001
From: David Tanner <guacamoleo@gmail.com>
Date: Fri, 3 Mar 2017 08:46:07 -0600
Subject: [PATCH 17/21] recursive solution selection logic working

---
 Tensile/Common.py               |  14 +---
 Tensile/LibraryLogic.py         |  46 +++++++----
 Tensile/TensileCreateLibrary.py | 131 +++++++++++++-------------------
 Tensile/__init__.py             |   2 +-
 4 files changed, 90 insertions(+), 103 deletions(-)

diff --git a/Tensile/Common.py b/Tensile/Common.py
index 9b04526c0..c2b8f54b3 100644
--- a/Tensile/Common.py
+++ b/Tensile/Common.py
@@ -152,16 +152,10 @@
 # Default Analysis Parameters
 ################################################################################
 defaultAnalysisParameters = {
-    "InitialSolutionWindow":     4,
-    "BranchWeight":              100, # microseconds / kernel
-
-    "Dilation":                 3,
-    "Threshold":                0.1,
-    "OutlierThreshold":         0,
-    "FractionTimeSavedMin":     0.01, # = 1%
-    "Weight0":                  100,
-    "Weight1":                  100,
-    "Weight2":                  100,
+    "InitialSolutionWindow":      4,
+    "BranchPenalty":           10000, # microseconds / kernel
+    "SmoothOutliers":         False, # enforce monotonic data
+    "SolutionImportanceMin":   0.01, # = 1%
     }
 
 
diff --git a/Tensile/LibraryLogic.py b/Tensile/LibraryLogic.py
index 3016f1c79..823d48452 100644
--- a/Tensile/LibraryLogic.py
+++ b/Tensile/LibraryLogic.py
@@ -50,8 +50,9 @@ def analyzeProblemType( problemTypeTuple, inputParameters ):
 
   ######################################
   # Correct outliers
-  # logicAnalyzer.smooth()
-  logicAnalyzer.print2D([0, 0])
+  if inputParameters["SmoothOutliers"]:
+    logicAnalyzer.smoothOutliers()
+  #logicAnalyzer.print2D([0, 0])
 
   ######################################
   # Create Rules
@@ -65,6 +66,8 @@ def analyzeProblemType( problemTypeTuple, inputParameters ):
       logicAnalyzer.globalIndexRange, logic)
   print1("\nScore: %.0f ms" % (score/1000))
 
+  logicAnalyzer.prepareLogic(logic)
+
   #return (skinnyRules01, skinnyRules10, diagonalRules)
   #return (problemType, logicAnalyzer.solutionsUsed, [], [], logicAnalyzer.diagonalRules )
   return (problemType, logicAnalyzer.solutions, logicAnalyzer.indexOrder, logic)
@@ -152,9 +155,6 @@ def __init__(self, problemType, problemSizes, solutions, inputParameters):
     self.rangeIndicesFree = range(0, self.problemType["NumIndicesC"])
     self.rangeIndicesSummation = range(self.problemType["NumIndicesC"], \
         self.problemType["TotalIndices"])
-    self.w0 = self.parameters["Weight0"]
-    self.w1 = self.parameters["Weight1"]
-    self.w2 = self.parameters["Weight2"]
     self.indexOrder = self.recommendedIndexOrder()
     print2("IndexOrder: %s" % self.indexOrder)
     self.globalIndexRange = []
@@ -248,7 +248,7 @@ def removeLeastImportantSolutions(self):
     while True:
       (lisIdx, lisPercSaved, lisPercWins, lisPercExec) \
           = self.leastImportantSolution()
-      if lisPercSaved < self.parameters["FractionTimeSavedMin"]:
+      if lisPercSaved < self.parameters["SolutionImportanceMin"]:
         self.removeSolution(lisIdx)
         continue
       else:
@@ -256,10 +256,9 @@ def removeLeastImportantSolutions(self):
 
 
   ##############################################################################
-  # ENTRY: Smooth - correct outliers
+  # ENTRY: Smooth Outliers
   ##############################################################################
-  def smooth(self):
-    outlierThreshold = self.parameters["OutlierThreshold"]
+  def smoothOutliers(self):
     problemSizes = [0]*self.numIndices
     for problemIndices in self.problemIndicesForGlobalRange:
       problemSerial = self.indicesToSerial(0, problemIndices)
@@ -281,8 +280,8 @@ def smooth(self):
             neighborAfterGFlops = self.data[neighborAfterIdx+solutionIdx]
             neighborGFlops.append(neighborBeforeGFlops)
             neighborGFlops.append(neighborAfterGFlops)
-            if neighborBeforeGFlops > gflops * (1+outlierThreshold) \
-                and neighborAfterGFlops * (1+outlierThreshold) < gflops :
+            if neighborBeforeGFlops > gflops \
+                and neighborAfterGFlops < gflops :
               smoothProblem = True
         if smoothProblem:
           s = ""
@@ -442,7 +441,7 @@ def enRule(self, currentIndexIndex, currentIndexRange):
           logicComplexity = [0]*self.numIndices
           self.scoreLogicComplexity( \
               [priorRuleForSize], logicComplexity)
-          priorRuleScore += self.parameters["BranchWeight"] \
+          priorRuleScore += self.parameters["BranchPenalty"] \
               * sum(logicComplexity)
           # score candidate
           candidateRuleScore = self.scoreRangeForLogic(nextIndexRange, \
@@ -450,9 +449,9 @@ def enRule(self, currentIndexIndex, currentIndexRange):
           logicComplexity = [0]*self.numIndices
           self.scoreLogicComplexity( \
               [candidateRule], logicComplexity)
-          candidateRuleScore += self.parameters["BranchWeight"] \
+          candidateRuleScore += self.parameters["BranchPenalty"] \
               * sum(logicComplexity)
-          candidateRuleScore += self.parameters["BranchWeight"] # penalize
+          candidateRuleScore += self.parameters["BranchPenalty"] # penalize
           candidateFaster = candidateRuleScore < priorRuleScore
           print2("%sP[%2u]: %s %s~%.0fus < %s~%.0fus" % (tab, problemIndex, \
               "wins" if candidateFaster else "same", \
@@ -487,6 +486,25 @@ def enRule(self, currentIndexIndex, currentIndexRange):
   ##############################################################################
 
 
+
+  ##############################################################################
+  # Prepare Logic - rewrites logic in place, recursing through logic[i][1]
+  # convert threshold indices (logic[i][0]) to sizes via problemIndexToSize
+  # last threshold at each level = -1
+  ##############################################################################
+  def prepareLogic(self, logic):
+    depth = self.getLogicDepth(logic)
+    if depth == 0: return # presumably a leaf, not a rule list - confirm in getLogicDepth
+    indexIndex = self.numIndices - depth # position of this level in indexOrder
+    index = self.indexOrder[indexIndex]
+    for i in range(0, len(logic)):
+      if i == len(logic)-1:
+        logic[i][0] = -1 # last rule: sentinel replaces the threshold
+      else:
+        logic[i][0] = self.problemIndexToSize[index][logic[i][0]]
+      self.prepareLogic(logic[i][1])
+
+
   ##############################################################################
   # Print2D
   ##############################################################################
diff --git a/Tensile/TensileCreateLibrary.py b/Tensile/TensileCreateLibrary.py
index a17366852..222a71d55 100644
--- a/Tensile/TensileCreateLibrary.py
+++ b/Tensile/TensileCreateLibrary.py
@@ -207,6 +207,7 @@ def writeLogic(outputPath, logicList, solutionWriter ):
     for i in range(0, len(argList)):
       s += "    %s%s" % (argList[i], ",\n" if i < len(argList)-1 else ") {\n\n")
 
+    """
     indent = "  "
     s += "%ssize_t sizeC = size%s" % ( indent, indexChars[0])
     for i in range(1, problemType["NumIndicesC"]):
@@ -217,64 +218,11 @@ def writeLogic(outputPath, logicList, solutionWriter ):
     for i in range(1, len(problemType["IndicesSummation"])):
       s += "*size%s" % indexChars[problemType["IndicesSummation"][i]]
     s += ";\n\n"
-    #for rule in skinnyLogic0:
-    #  print2(rule)
-    #for rule in skinnyLogic1:
-    #  print2(rule)
-    print2(solutionNames)
-
-    logicStr = writeLogicRec(0, indexOrder, logic, solutionNames)
-    print logicStr
-    printExit("TODO")
-    #for indexIndex in range(0, problemType["TotalIndices"]):
-    #  index = indexOrder[indexIndex]
-    #  for ruleIdx in range(0, len(logic)):
-    #    rule = logic[ruleIdx]
-
-
     """
-      print2(rule)
-      winnerIdx = rule[0]
-      problemSize = rule[1]
-      minGFlops = rule[2]
-      maxGFlops = rule[3]
-      # rule logic
-      if ruleIdx == len(diagonalLogic)-1:
-        if len(diagonalLogic) > 1:
-          s += "%selse" % indent
-        else:
-          s += "%s" % indent
-      else:
-        s += "%s%s(sizeC >= static_cast<size_t>(%u" % (indent, ("if" if ruleIdx == 0 else "else if"), problemSize[0])
-        for i in range(1, problemType["NumIndicesC"]):
-          s += "*%u" % problemSize[i]
-        s += "))"
-      s += " return %s(" % solutionNames[winnerIdx]
-      # solution parameters
-      s += " dataC, dataA, dataB, alpha"
-      if problemType["UseBeta"]:
-        s += ", beta"
-      s += ", offsetC, offsetA, offsetB"
-      firstStride = 1
-      if problemType["UseInitialStrides"]:
-        firstStride = 0
-      lastStrideC = problemType["NumIndicesC"]
-      lastStrideA = len(problemType["IndexAssignmentsA"])
-      lastStrideB = len(problemType["IndexAssignmentsB"])
-
-      for i in range(firstStride,lastStrideC):
-        s += ", strideC%u%s" % (i, indexChars[i])
-      for i in range(firstStride,lastStrideA):
-        s += ", strideA%u%s" % (i, \
-            indexChars[problemType["IndexAssignmentsA"][i]])
-      for i in range(firstStride,lastStrideB):
-        s += ", strideB%u%s" % (i, \
-            indexChars[problemType["IndexAssignmentsB"][i]])
-      for i in range(0, problemType["TotalIndices"]):
-        s += ", size%s" % indexChars[i]
-      s += ", stream, numInputEvents, inputEvents, outputEvent ); /* [%f,%f] GFlops*/\n" % (minGFlops,maxGFlops)
+    print2(solutionNames)
 
-    """
+    logicStr = writeLogicRec(0, indexOrder, logic, solutionNames, problemType)
+    s += logicStr
     s += "\n}\n"
 
     # open and close individual files
@@ -299,43 +247,70 @@ def writeLogic(outputPath, logicList, solutionWriter ):
 ################################################################################
 # Write Logic Recursive
 ################################################################################
-def writeLogicRec(depth, indexOrder, logic, solutionNames):
+def writeLogicRec(depth, indexOrder, logic, solutionNames, problemType):
   indexChars = globalParameters["IndexChars"]
   indent = "  "
   indent += "  "*depth
   s = ""
   lowestLevel = depth == len(indexOrder)-1
   numRules = len(logic)
-  if numRules > 1:
-    # multiple rules, need if/else
-    for ruleIdx in range(0, numRules):
-      rule = logic[ruleIdx]
-      threshold = rule[0]
-      if lowestLevel:
-        solutionIdx = rule[1]
-        s += "%sif (size%s < %u) return solution[%u];\n" \
-            % (indent, indexChars[indexOrder[depth]], threshold, solutionIdx)
-      else:
-        s += "%sif (size%s < %u) {\n" \
-            % (indent, indexChars[indexOrder[depth]], threshold)
-        s += writeLogicRec(depth+1, indexOrder, rule[1], solutionNames)
-        s += "%s}\n" % (indent)
-  else:
-    ruleIdx = 0
+  for ruleIdx in range(0, numRules):
     rule = logic[ruleIdx]
     threshold = rule[0]
     if lowestLevel:
       solutionIdx = rule[1]
-      s += "%sreturn solution[%u];\n" \
-          % (indent, solutionIdx)
+      solutionCall = writeSolutionCall(solutionNames[solutionIdx],problemType)
+      if threshold > 0:
+        s += "%sif (size%s < %u) return %s;\n" \
+            % (indent, indexChars[indexOrder[depth]], threshold, solutionCall)
+      else:
+        s += "%sreturn %s;\n" % (indent, solutionCall)
     else:
-      s += "%s{\n" \
-          % (indent)
-      s += writeLogicRec(depth+1, indexOrder, rule[1], solutionNames)
+      if threshold > 0:
+        s += "%sif (size%s < %u) {\n" \
+            % (indent, indexChars[indexOrder[depth]], threshold)
+      else:
+        s += "%s{\n" % (indent)
+      s += writeLogicRec(depth+1, indexOrder, rule[1], solutionNames, \
+          problemType)
       s += "%s}\n" % (indent)
   return s
 
 
+################################################################################
+# Write Solution Call
+################################################################################
+def writeSolutionCall(solutionName, problemType):
+  # Build the C call expression "<solutionName>( dataC, ..., outputEvent )".
+  # Argument order: data pointers, alpha, optional beta, offsets, strides of
+  # C, A and B, one size per index, then the stream/event arguments.
+  # problemType is indexed by key; globalParameters supplies IndexChars.
+  indexChars = globalParameters["IndexChars"]
+  assignmentsA = problemType["IndexAssignmentsA"]
+  assignmentsB = problemType["IndexAssignmentsB"]
+  # beta is forwarded only when the problem type uses it
+  args = ["dataC", "dataA", "dataB", "alpha"]
+  if problemType["UseBeta"]:
+    args.append("beta")
+  args += ["offsetC", "offsetA", "offsetB"]
+  # stride 0 is forwarded only when UseInitialStrides is set
+  firstStride = 0 if problemType["UseInitialStrides"] else 1
+  args += ["strideC%u%s" % (dim, indexChars[dim]) \
+      for dim in range(firstStride, problemType["NumIndicesC"])]
+  args += ["strideA%u%s" % (dim, indexChars[assignmentsA[dim]]) \
+      for dim in range(firstStride, len(assignmentsA))]
+  args += ["strideB%u%s" % (dim, indexChars[assignmentsB[dim]]) \
+      for dim in range(firstStride, len(assignmentsB))]
+  # one size argument per index of the problem
+  args += ["size%s" % indexChars[idx] \
+      for idx in range(0, problemType["TotalIndices"])]
+  args += ["stream", "numInputEvents", "inputEvents", "outputEvent"]
+  # spacing inside the parentheses is kept: "name( a, b )"
+  return "%s( %s )" % (solutionName, ", ".join(args))
+
+
+
+
 ################################################################################
 # Write CMake
 ################################################################################
diff --git a/Tensile/__init__.py b/Tensile/__init__.py
index edc60b350..8a124bf64 100644
--- a/Tensile/__init__.py
+++ b/Tensile/__init__.py
@@ -1 +1 @@
-__version__ = "2.1.6"
+__version__ = "2.2.0"

From 88636743bd2c6687ef0a3c070dc8b86097acb8b9 Mon Sep 17 00:00:00 2001
From: David Tanner <guacamoleo@gmail.com>
Date: Fri, 3 Mar 2017 09:13:26 -0600
Subject: [PATCH 18/21] added api timer to library client, runscript prints
 path to library client

---
 Tensile/ClientWriter.py | 13 ++++++++-----
 Tensile/Source/Client.h | 43 ++++++++++++++++++++++++++---------------
 2 files changed, 35 insertions(+), 21 deletions(-)

diff --git a/Tensile/ClientWriter.py b/Tensile/ClientWriter.py
index 2ee9ba083..dd6e746fa 100644
--- a/Tensile/ClientWriter.py
+++ b/Tensile/ClientWriter.py
@@ -142,13 +142,16 @@ def writeRunScript(path, libraryLogicPath, forBenchmark):
       if os.name != "nt" else "") )
   #if os.name != "nt":
   #  runScriptFile.write("find .\n")
-  runScriptFile.write("%s & echo %s & echo # Running Client & echo %s\n" \
-      % (echoLine, HR, HR))
+  #    % (echoLine, HR, HR))
+  executablePath = os.path.join(globalParameters["WorkingPath"])
   if os.name == "nt":
-    runScriptFile.write(os.path.join(globalParameters["CMakeBuildType"], \
-        "client.exe") )
+    executablePath = os.path.join(executablePath, \
+        globalParameters["CMakeBuildType"], \
+        "client.exe")
   else:
-    runScriptFile.write("./client")
+    executablePath = os.path.join(executablePath, "client")
+  runScriptFile.write("%s & echo %s & echo # Library Client Path: & echo %s\n" \
+      % (echoLine, HR, executablePath) )
   runScriptFile.close()
   if os.name != "nt":
     os.chmod(runScriptName, 0777)
diff --git a/Tensile/Source/Client.h b/Tensile/Source/Client.h
index eee665ca4..e0fc00446 100644
--- a/Tensile/Source/Client.h
+++ b/Tensile/Source/Client.h
@@ -30,6 +30,7 @@
 #include <fstream>
 
 TensileTimer timer;
+TensileTimer apiTimer;
 std::ofstream file;
 
 void initControls();
@@ -135,7 +136,8 @@ bool callLibrary(
             std::cout << "  Device | Reference" << std::endl;
             firstPrint = false;
           }
-          std::cout << "[" << (numChecked-1) << "] " << i << ": " << tensileToString(deviceOnHostC[i])
+          std::cout << "[" << (numChecked-1) << "] " << i << ": "
+            << tensileToString(deviceOnHostC[i])
             << (equal ? "==" : "!=") << tensileToString(referenceC[i])
             << std::endl;
           printIdx++;
@@ -146,10 +148,14 @@ bool callLibrary(
 
   // time solution
   timer.start();
+  double apiTimeUs = 0;
   for (unsigned int syncIdx = 0; syncIdx < numSyncsPerBenchmark; syncIdx++) {
+    apiTimer.start();
     for (unsigned int enqIdx = 0; enqIdx < numEnqueuesPerSync; enqIdx++) {
       generatedCallToFunction( userSizes, alpha, beta );
     }
+    double currentApiTimeUs = apiTimer.elapsed_us() / numEnqueuesPerSync;
+    apiTimeUs += currentApiTimeUs;
     // sync
 #if Tensile_BACKEND_OCL
     status = clFinish(stream); tensileStatusCheck(status);
@@ -174,30 +180,35 @@ bool callLibrary(
       << numFunctions << "]:"
       << std::setw(10) << std::fixed << std::setprecision(3)
       << gflops << " GFlop/s";
-      if (newFastest) {
-        std::cout << "*";
-      } else {
-        std::cout << " ";
-      }
+    if (newFastest) {
+      std::cout << "*";
+    } else {
+      std::cout << " ";
+    }
     std::cout << " |"
       << std::setw(9) << std::fixed << std::setprecision(3) << timeMs
       << " ms | v: " << (numInvalids ? "FAILED" : "PASSED")
-      << " p: " << (numChecked-numInvalids) << "/" << numChecked << std::endl;
+      << " p: " << (numChecked-numInvalids) << "/" << numChecked;
+    std::cout << " api:" << std::setw(6) << std::fixed 
+      << std::setprecision(3) << apiTimeUs << " us";
+    std::cout << std::endl;
   } else {
     std::cout << "Function[" << functionIdx << "/" << numFunctions << "]:"
       << std::setw(10) << std::fixed << std::setprecision(3)
       << gflops << " GFlop/s";
-      if (newFastest) {
-        std::cout << "*";
-      } else {
-        std::cout << " ";
-      }
+    if (newFastest) {
+      std::cout << "*";
+    } else {
+      std::cout << " ";
+    }
     std::cout << " |"
       << std::setw(9) << std::fixed << std::setprecision(3) << timeMs << " ms";
-      if (newFastest) {
-        std::cout << "*";
-      }
-      std::cout << std::endl;
+    if (newFastest) {
+      std::cout << "*";
+    }
+    std::cout << " api:" << std::setw(6) << std::fixed 
+      << std::setprecision(3) << apiTimeUs << " us";
+    std::cout << std::endl;
   }
   return (numInvalids > 0);
 } // callLibrary

From 71edbd658fbea17fa44ed285e344133540072235 Mon Sep 17 00:00:00 2001
From: David Tanner <guacamoleo@gmail.com>
Date: Fri, 3 Mar 2017 10:04:01 -0600
Subject: [PATCH 19/21] client prints api time

---
 Tensile/Source/Client.h | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/Tensile/Source/Client.h b/Tensile/Source/Client.h
index e0fc00446..401b6ab8f 100644
--- a/Tensile/Source/Client.h
+++ b/Tensile/Source/Client.h
@@ -164,6 +164,7 @@ bool callLibrary(
 #endif
     tensileStatusCheck(status);
   } // sync loop
+  apiTimeUs /= numSyncsPerBenchmark;
 
   double timeMs = timer.elapsed_ms()
     / numSyncsPerBenchmark / numEnqueuesPerSync;
@@ -188,8 +189,8 @@ bool callLibrary(
     std::cout << " |"
       << std::setw(9) << std::fixed << std::setprecision(3) << timeMs
       << " ms | v: " << (numInvalids ? "FAILED" : "PASSED")
-      << " p: " << (numChecked-numInvalids) << "/" << numChecked;
-    std::cout << " api:" << std::setw(6) << std::fixed 
+      << " " << (numChecked-numInvalids) << "/" << numChecked;
+    std::cout << " | api:" << std::setw(6) << std::fixed 
       << std::setprecision(3) << apiTimeUs << " us";
     std::cout << std::endl;
   } else {
@@ -206,7 +207,7 @@ bool callLibrary(
     if (newFastest) {
       std::cout << "*";
     }
-    std::cout << " api:" << std::setw(6) << std::fixed 
+    std::cout << " | api:" << std::setw(6) << std::fixed 
       << std::setprecision(3) << apiTimeUs << " us";
     std::cout << std::endl;
   }
@@ -370,7 +371,7 @@ bool benchmarkAllSolutionsForSize(
       }
       std::cout << " |"
         << std::setw(9) << std::fixed << std::setprecision(3) << timeMs << " ms | v: " << (numInvalids ? "FAILED" : "PASSED")
-        << " p: " << (numChecked-numInvalids) << "/" << numChecked << std::endl;
+        << " " << (numChecked-numInvalids) << "/" << numChecked << std::endl;
     }
 #if 1
     else {

From 40d5315765da6fa24d986c31ae76039d09b17240 Mon Sep 17 00:00:00 2001
From: David Tanner <guacamoleo@gmail.com>
Date: Fri, 3 Mar 2017 13:42:31 -0600
Subject: [PATCH 20/21] fixed clientwriter executing exe for benchmarks but
 printing exe only for library

---
 Tensile/ClientWriter.py                 | 26 ++++++++++++++-----------
 Tensile/Configs/rocblas_cgemm.yaml      |  3 +--
 Tensile/Configs/rocblas_dgemm.yaml      |  3 +--
 Tensile/Configs/rocblas_sgemm.yaml      |  3 +--
 Tensile/Configs/rocblas_zgemm.yaml      |  3 +--
 Tensile/Configs/sgemm.yaml              |  3 +--
 Tensile/Configs/tensor_contraction.yaml |  3 +--
 7 files changed, 21 insertions(+), 23 deletions(-)

diff --git a/Tensile/ClientWriter.py b/Tensile/ClientWriter.py
index dd6e746fa..de391a40e 100644
--- a/Tensile/ClientWriter.py
+++ b/Tensile/ClientWriter.py
@@ -140,18 +140,22 @@ def writeRunScript(path, libraryLogicPath, forBenchmark):
   runScriptFile.write("cmake --build . --config %s%s\n" \
       % (globalParameters["CMakeBuildType"], " -- -j 8" \
       if os.name != "nt" else "") )
-  #if os.name != "nt":
-  #  runScriptFile.write("find .\n")
-  #    % (echoLine, HR, HR))
-  executablePath = os.path.join(globalParameters["WorkingPath"])
-  if os.name == "nt":
-    executablePath = os.path.join(executablePath, \
-        globalParameters["CMakeBuildType"], \
-        "client.exe")
+  if forBenchmark:
+    if os.name == "nt":
+      runScriptFile.write(os.path.join(globalParameters["CMakeBuildType"], \
+          "client.exe") )
+    else:
+      runScriptFile.write("./client")
   else:
-    executablePath = os.path.join(executablePath, "client")
-  runScriptFile.write("%s & echo %s & echo # Library Client Path: & echo %s\n" \
-      % (echoLine, HR, executablePath) )
+    executablePath = os.path.join(globalParameters["WorkingPath"])
+    if os.name == "nt":
+      executablePath = os.path.join(executablePath, \
+          globalParameters["CMakeBuildType"], \
+          "client.exe")
+    else:
+      executablePath = os.path.join(executablePath, "client")
+    runScriptFile.write("%s & echo %s & echo # Library Client Path: & echo %s\n" \
+        % (echoLine, HR, executablePath) )
   runScriptFile.close()
   if os.name != "nt":
     os.chmod(runScriptName, 0777)
diff --git a/Tensile/Configs/rocblas_cgemm.yaml b/Tensile/Configs/rocblas_cgemm.yaml
index 3c0fbd958..40e68b3a2 100644
--- a/Tensile/Configs/rocblas_cgemm.yaml
+++ b/Tensile/Configs/rocblas_cgemm.yaml
@@ -151,7 +151,6 @@ BenchmarkProblems:
       - ProblemSizes: [ [32, 32, 32, 4000], [32, 32, 32, 4000], [2], [1536] ]
 
 LibraryLogic:
-  Dilation: 3
-  Threshold: 0.1
+  BranchPenalty: 0
 
 LibraryClient:
diff --git a/Tensile/Configs/rocblas_dgemm.yaml b/Tensile/Configs/rocblas_dgemm.yaml
index f4bc58257..51e8eea11 100644
--- a/Tensile/Configs/rocblas_dgemm.yaml
+++ b/Tensile/Configs/rocblas_dgemm.yaml
@@ -151,7 +151,6 @@ BenchmarkProblems:
       - ProblemSizes: [ [32, 32, 32, 4000], [32, 32, 32, 4000], [2], [1536] ]
 
 LibraryLogic:
-  Dilation: 3
-  Threshold: 0.1
+  BranchPenalty: 0
 
 LibraryClient:
diff --git a/Tensile/Configs/rocblas_sgemm.yaml b/Tensile/Configs/rocblas_sgemm.yaml
index fedec0deb..c4ed34214 100644
--- a/Tensile/Configs/rocblas_sgemm.yaml
+++ b/Tensile/Configs/rocblas_sgemm.yaml
@@ -151,7 +151,6 @@ BenchmarkProblems:
       - ProblemSizes: [ [32, 32, 32, 4000], [32, 32, 32, 4000], [2], [2880] ]
 
 LibraryLogic:
-  Dilation: 3
-  Threshold: 0.1
+  BranchPenalty: 0
 
 LibraryClient:
diff --git a/Tensile/Configs/rocblas_zgemm.yaml b/Tensile/Configs/rocblas_zgemm.yaml
index 64f0f6af9..4082ed6c8 100644
--- a/Tensile/Configs/rocblas_zgemm.yaml
+++ b/Tensile/Configs/rocblas_zgemm.yaml
@@ -151,7 +151,6 @@ BenchmarkProblems:
       - ProblemSizes: [ [32, 32, 32, 4000], [32, 32, 32, 4000], [2], [1536] ]
 
 LibraryLogic:
-  Dilation: 3
-  Threshold: 0.1
+  BranchPenalty: 0
 
 LibraryClient:
diff --git a/Tensile/Configs/sgemm.yaml b/Tensile/Configs/sgemm.yaml
index 7d2805e4a..f9ddad0c5 100644
--- a/Tensile/Configs/sgemm.yaml
+++ b/Tensile/Configs/sgemm.yaml
@@ -49,7 +49,6 @@ BenchmarkProblems:
       - ProblemSizes: [ [16, 128], [16, 128], [256] ]
 
 LibraryLogic:
-  Dilation: 3
-  Threshold: 0.1
+  BranchPenalty: 0
 
 LibraryClient:
diff --git a/Tensile/Configs/tensor_contraction.yaml b/Tensile/Configs/tensor_contraction.yaml
index 8201d0bec..aa5e779f4 100644
--- a/Tensile/Configs/tensor_contraction.yaml
+++ b/Tensile/Configs/tensor_contraction.yaml
@@ -50,7 +50,6 @@ BenchmarkProblems:
       - ProblemSizes: [ [16, 128], [16, 128], [2, 2, 4], [256] ]
 
 LibraryLogic:
-  Dilation: 3
-  Threshold: 0.1
+  BranchPenalty: 0
 
 LibraryClient:

From 900d32fb8b811cbd0a9ee462f24e984688ab5e3d Mon Sep 17 00:00:00 2001
From: David Tanner <guacamoleo@gmail.com>
Date: Fri, 3 Mar 2017 15:50:15 -0600
Subject: [PATCH 21/21] lowering default BranchPenalty since logic is
 inexpensive (1.5us)

---
 Tensile/Common.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Tensile/Common.py b/Tensile/Common.py
index c2b8f54b3..2e3eb3151 100644
--- a/Tensile/Common.py
+++ b/Tensile/Common.py
@@ -153,7 +153,7 @@
 ################################################################################
 defaultAnalysisParameters = {
     "InitialSolutionWindow":      4,
-    "BranchPenalty":           10000, # microseconds / kernel
+    "BranchPenalty":            100, # microseconds / kernel
     "SmoothOutliers":         False, # enforce monotonic data
     "SolutionImportanceMin":   0.01, # = 1%
     }