diff --git a/Tensile/BenchmarkProblems.py b/Tensile/BenchmarkProblems.py
index 8ec4ed719..62d9867b4 100644
--- a/Tensile/BenchmarkProblems.py
+++ b/Tensile/BenchmarkProblems.py
@@ -37,7 +37,7 @@ def benchmarkProblemType( config ):
 
   totalBenchmarkSteps = len(benchmarkProcess)
   winners = WinningParameterDict()
-  print1("NumBenchmarkSteps: %u" % totalBenchmarkSteps)
+  print1("# NumBenchmarkSteps: %u" % totalBenchmarkSteps)
   print1("")
   print1(HR)
   print1("# Done Creating BenchmarkProcess Object")
@@ -148,6 +148,7 @@ def benchmarkProblemType( config ):
     # Enumerate Solutions = Hardcoded * Benchmark
     ############################################################################
     sys.stdout.write("# Enumerating Solutions")
+    solutionSet = set() # avoid duplicate solutions, e.g. for NumLoadsCoalescedA/B = -1, 1
     for hardcodedIdx in range(0, numHardcoded):
       solutions.append([])
       hardcodedParamDict = benchmarkStep.hardcodedParameters[hardcodedIdx]
@@ -169,10 +170,15 @@ def benchmarkProblemType( config ):
                 benchmarkStep.initialSolutionParameters[initialSolutionParameterName]
         # TODO check if solution matches problem size for exact tile kernels
         solutionObject = Solution(solution)
-        if SolutionWriter.solutionParametersConsistent(solutionObject):
-          solutions[hardcodedIdx].append(solutionObject)
-          if globalParameters["PrintLevel"] >= 1:
-            sys.stdout.write("|")
+        if solutionObject["Valid"]:
+          if solutionObject not in solutionSet:
+            solutionSet.add(solutionObject)
+            solutions[hardcodedIdx].append(solutionObject)
+            if globalParameters["PrintLevel"] >= 1:
+              sys.stdout.write("|")
+          else:
+            if globalParameters["PrintLevel"] >= 1:
+              sys.stdout.write(":")
         else:
           if globalParameters["PrintLevel"] >= 1:
             sys.stdout.write(".")
@@ -541,8 +547,8 @@ def get( lookupHardcodedParameters, winners ):
           #for paramName in hardcodedFrozen:
           #  paramValue = hardcodedFrozen[paramName]
           #  matchUnion[paramName] = paramValue
-          Solution.assignDimsFromEdgeAndShape(matchUnion)
-          Solution.assignDimsFromEdgeAndShape(hardcodedFrozen.parameters)
+          Solution.assignProblemIndependentDerivedParameters(matchUnion)
+          Solution.assignProblemIndependentDerivedParameters(hardcodedFrozen.parameters)
           if matchUnion["MacroTile0"] != lookupMacroTile0 \
               or matchUnion["MacroTile1"] != lookupMacroTile1:
             matchMacroTile = False
diff --git a/Tensile/BenchmarkStructs.py b/Tensile/BenchmarkStructs.py
index b119001ae..48bbf9813 100644
--- a/Tensile/BenchmarkStructs.py
+++ b/Tensile/BenchmarkStructs.py
@@ -2,7 +2,7 @@
 # from copy import *
 
 from copy import copy, deepcopy
-from Common import print1, print2, printWarning, defaultSolution, defaultProblemSizes, defaultBenchmarkFinalProblemSizes, defaultBenchmarkCommonParameters, hasParam, defaultBenchmarkJoinParameters, getParamValues, defaultForkParameters, defaultBenchmarkForkParameters, defaultJoinParameters, printExit
+from Common import print1, print2, printWarning, defaultSolution, defaultProblemSizes, defaultBenchmarkFinalProblemSizes, defaultBenchmarkCommonParameters, hasParam, defaultBenchmarkJoinParameters, getParamValues, defaultForkParameters, defaultBenchmarkForkParameters, defaultJoinParameters, printExit, globalParameters
 from SolutionStructs import Solution, ProblemType, ProblemSizes
 
 ################################################################################
@@ -399,14 +399,16 @@ def convertParametersToSteps(self):
           macroTileDim0 = workGroupEdgeValues[workGroupEdgeIdx]*threadTileEdgeValues[threadTileEdgeIdx]
           macroTileDim1 = macroTileDim0
           if workGroupShapeValues[workGroupShapeIdx] < 0:
-            macroTileDim1 /= 2
+            macroTileDim0 *= abs(workGroupShapeValues[workGroupShapeIdx])
           elif workGroupShapeValues[workGroupShapeIdx] > 0:
-            macroTileDim1 *= 2
+            macroTileDim1 *= abs(workGroupShapeValues[workGroupShapeIdx])
           if threadTileShapeValues[threadTileShapeIdx] < 0:
-            macroTileDim1 /= 2
+            macroTileDim0 *= abs(threadTileShapeValues[threadTileShapeIdx])
           elif threadTileShapeValues[threadTileShapeIdx] > 0:
-            macroTileDim1 *= 2
-          if macroTileDim0/macroTileDim1 <= self.initialSolutionParameters["MacroTileMaxRatio"] and macroTileDim1/macroTileDim0 <= self.initialSolutionParameters["MacroTileMaxRatio"]:
+            macroTileDim1 *= abs(threadTileShapeValues[threadTileShapeIdx])
+          # TODO is this macro tile ratio filter still useful?
+          if macroTileDim0/macroTileDim1 <= globalParameters["MaxMacroTileRatio"] \
+              and macroTileDim1/macroTileDim0 <= globalParameters["MaxMacroTileRatio"]:
             macroTileJoinSet.add((macroTileDim0, macroTileDim1))
         totalPermutations *=len(macroTileJoinSet)
         print2("JoinMacroTileSet(%u): %s" % (len(macroTileJoinSet), macroTileJoinSet) )
diff --git a/Tensile/ClientWriter.py b/Tensile/ClientWriter.py
index f3289dcc8..de391a40e 100644
--- a/Tensile/ClientWriter.py
+++ b/Tensile/ClientWriter.py
@@ -1,4 +1,4 @@
-from Common import globalParameters, HR, pushWorkingPath, popWorkingPath, print1, CHeader, printExit
+from Common import globalParameters, HR, pushWorkingPath, popWorkingPath, print1, CHeader, printExit, printWarning
 from SolutionStructs import Solution
 from SolutionWriter import SolutionWriter
 import YAMLIO
@@ -54,13 +54,14 @@ def main( config ):
   ##############################################################################
   logicFiles = [os.path.join(libraryLogicPath, f) for f \
       in os.listdir(libraryLogicPath) \
-      if os.path.isfile(os.path.join(libraryLogicPath, f))]
+      if (os.path.isfile(os.path.join(libraryLogicPath, f)) \
+      and os.path.splitext(f)[1]==".yaml")]
   print1("LogicFiles: %s" % logicFiles)
   functions = []
   functionNames = []
   for logicFileName in logicFiles:
-    (scheduleName, problemType, solutionsForType, skinnyLogic0, skinnyLogic1, \
-        diagonalLogic) = YAMLIO.readLibraryLogicForProblemType(logicFileName)
+    (scheduleName, problemType, solutionsForType, indexOrder, logic) \
+        = YAMLIO.readLibraryLogicForProblemType(logicFileName)
     functions.append((scheduleName, problemType))
     functionNames.append("tensile_%s_%s" % (scheduleName, problemType))
 
@@ -139,15 +140,22 @@ def writeRunScript(path, libraryLogicPath, forBenchmark):
   runScriptFile.write("cmake --build . --config %s%s\n" \
       % (globalParameters["CMakeBuildType"], " -- -j 8" \
       if os.name != "nt" else "") )
-  #if os.name != "nt":
-  #  runScriptFile.write("find .\n")
-  runScriptFile.write("%s & echo %s & echo # Running Client & echo %s\n" \
-      % (echoLine, HR, HR))
-  if os.name == "nt":
-    runScriptFile.write(os.path.join(globalParameters["CMakeBuildType"], \
-        "client.exe") )
+  if forBenchmark:
+    if os.name == "nt":
+      runScriptFile.write(os.path.join(globalParameters["CMakeBuildType"], \
+          "client.exe") )
+    else:
+      runScriptFile.write("./client")
   else:
-    runScriptFile.write("./client")
+    executablePath = os.path.join(globalParameters["WorkingPath"])
+    if os.name == "nt":
+      executablePath = os.path.join(executablePath, \
+          globalParameters["CMakeBuildType"], \
+          "client.exe")
+    else:
+      executablePath = os.path.join(executablePath, "client")
+    runScriptFile.write("%s & echo %s & echo # Library Client Path: & echo %s\n" \
+        % (echoLine, HR, executablePath) )
   runScriptFile.close()
   if os.name != "nt":
     os.chmod(runScriptName, 0777)
diff --git a/Tensile/Common.py b/Tensile/Common.py
index 8576813b2..2e3eb3151 100644
--- a/Tensile/Common.py
+++ b/Tensile/Common.py
@@ -49,8 +49,10 @@
 globalParameters["DataInitType"] = 0 # 0=rand, 1=1, 2=serial
 # protect against invalid kernel
 globalParameters["MaxThreads"] = 256
-globalParameters["MaxRegisters"] = 256
+globalParameters["MinThreads"] = 64
 globalParameters["MaxLDS"] = 32768
+globalParameters["MaxMacroTileRatio"] = 4
+globalParameters["MaxThreadTile"] = 64
 
 
 ################################################################################
@@ -67,15 +69,17 @@
     {"EdgeType":                [ "Branch" ] }, # Shift
     {"EdgeMultiKernel":         [ False ] },
     {"PadLDS":                  [ 1 ] },
+    {"SplitU":                  [ 1 ] },
+    {"Prefetch":                [ False ] },
     ]
 # benchmark these solution independently
 defaultForkParameters = [
     {"WorkGroupEdge":           [ 16, 8 ] },
-    {"WorkGroupShape":          [ 0 ] }, # -1, 0, 1
+    {"WorkGroupShape":          [ 0 ] }, # -4, -2, 0, 2, 4
     {"ThreadTileEdge":          [ 1, 2, 4, 6, 8 ] },
-    {"ThreadTileShape":         [ 0 ] }, # -1, 0, 1
-    {"SplitU":                  [ 1 ] },
-    {"Prefetch":                [ False ] },
+    {"ThreadTileShape":         [ 0 ] }, # -4, -2, 0, 2, 4
+    {"NumLoadsCoalescedA":       [ 1, -1 ] },
+    {"NumLoadsCoalescedB":       [ 1, -1 ] },
     ]
 # keep one winner per solution and it affects which will win
 defaultBenchmarkForkParameters = [
@@ -88,18 +92,17 @@
     ]
 # keep one winner per solution and it would affect which solutions fastest
 defaultBenchmarkJoinParameters = [
-    {"NumLoadsCoalescedA":       [ 1, 2, 3, 4, 6, 8 ] },
-    {"NumLoadsCoalescedB":       [ 1, 2, 3, 4, 6, 8 ] },
     {"VectorWidthGlobalLoad":   [ 4 ] },
     {"VectorWidthGlobalStore":  [ 4 ] },
     {"VectorWidthLocalLoad":    [ 4 ] },
     {"VectorWidthLocalStore":   [ 4 ] },
     ]
 
-# derrived parameters may show up in solution dict but don't use for naming
-derrivedParameters = [
+# derived parameters may show up in the solution dict but are not used for naming
+derivedParameters = [
     "MacroTile0",
     "MacroTile1",
+    "DepthU",
     "WorkGroup0",
     "WorkGroup1",
     "ThreadTile0",
@@ -108,6 +111,7 @@
     "NumLoadsB",
     "NumLoadsPerpendicularA",
     "NumLoadsPerpendicularB",
+    "NumThreads",
     ]
 
 # dictionary of defaults comprised for 1st option for each parameter
@@ -118,7 +122,6 @@
     for key, value in paramDict.iteritems():
       defaultSolution[key] = value[0]
 # other non-benchmark options for solutions
-defaultSolution["MacroTileMaxRatio"] = 2
 
 ################################################################################
 # Default Problem Type
@@ -149,8 +152,10 @@
 # Default Analysis Parameters
 ################################################################################
 defaultAnalysisParameters = {
-    "Dilation":                 3,
-    "Threshold":                0.1,
+    "InitialSolutionWindow":      4,
+    "BranchPenalty":            100, # microseconds / kernel
+    "SmoothOutliers":         False, # enforce monotonic data
+    "SolutionImportanceMin":   0.01, # = 1%
     }
 
 
diff --git a/Tensile/Configs/rocblas_cgemm.yaml b/Tensile/Configs/rocblas_cgemm.yaml
index b1c5e2257..40e68b3a2 100644
--- a/Tensile/Configs/rocblas_cgemm.yaml
+++ b/Tensile/Configs/rocblas_cgemm.yaml
@@ -41,8 +41,8 @@ BenchmarkProblems:
       - ThreadTileShape: [ 0 ]
       - WorkGroupEdge: [8, 16]
       - ThreadTileEdge: [2, 4, 6, 8]
-      - NumLoadsCoalescedA: [ 1, 2, 3, 4, 6, 8 ]
-      - NumLoadsCoalescedB: [ 1, 2, 3, 4, 6, 8 ]
+      - NumLoadsCoalescedA: [ 1, -1 ]
+      - NumLoadsCoalescedB: [ 1, -1 ]
       - LoopUnroll: [8, 16]
     BenchmarkForkParameters:
     JoinParameters:
@@ -74,8 +74,8 @@ BenchmarkProblems:
       - WorkGroupShape: [ 0 ]
       - ThreadTileEdge: [2, 4, 6, 8]
       - ThreadTileShape: [ 0 ]
-      - NumLoadsCoalescedA: [ 1, 2, 3, 4, 6, 8 ]
-      - NumLoadsCoalescedB: [ 1, 2, 3, 4, 6, 8 ]
+      - NumLoadsCoalescedA: [ 1, -1 ]
+      - NumLoadsCoalescedB: [ 1, -1 ]
       - LoopUnroll: [8, 16]
     BenchmarkForkParameters:
     JoinParameters:
@@ -107,8 +107,8 @@ BenchmarkProblems:
       - WorkGroupShape: [ 0 ]
       - ThreadTileEdge: [2, 4, 6, 8]
       - ThreadTileShape: [ 0 ]
-      - NumLoadsCoalescedA: [ 1, 2, 3, 4, 6, 8 ]
-      - NumLoadsCoalescedB: [ 1, 2, 3, 4, 6, 8 ]
+      - NumLoadsCoalescedA: [ 1, -1 ]
+      - NumLoadsCoalescedB: [ 1, -1 ]
       - LoopUnroll: [8, 16]
     BenchmarkForkParameters:
     JoinParameters:
@@ -140,8 +140,8 @@ BenchmarkProblems:
       - WorkGroupShape: [ 0 ]
       - ThreadTileEdge: [2, 4, 6, 8]
       - ThreadTileShape: [ 0 ]
-      - NumLoadsCoalescedA: [ 1, 2, 3, 4, 6, 8 ]
-      - NumLoadsCoalescedB: [ 1, 2, 3, 4, 6, 8 ]
+      - NumLoadsCoalescedA: [ 1, -1 ]
+      - NumLoadsCoalescedB: [ 1, -1 ]
       - LoopUnroll: [8, 16]
     BenchmarkForkParameters:
     JoinParameters:
@@ -151,7 +151,6 @@ BenchmarkProblems:
       - ProblemSizes: [ [32, 32, 32, 4000], [32, 32, 32, 4000], [2], [1536] ]
 
 LibraryLogic:
-  Dilation: 3
-  Threshold: 0.1
+  BranchPenalty: 0
 
 LibraryClient:
diff --git a/Tensile/Configs/rocblas_dgemm.yaml b/Tensile/Configs/rocblas_dgemm.yaml
index 36e358135..51e8eea11 100644
--- a/Tensile/Configs/rocblas_dgemm.yaml
+++ b/Tensile/Configs/rocblas_dgemm.yaml
@@ -41,8 +41,8 @@ BenchmarkProblems:
       - ThreadTileShape: [ 0 ]
       - WorkGroupEdge: [8, 16]
       - ThreadTileEdge: [2, 4, 6, 8]
-      - NumLoadsCoalescedA: [ 1, 2, 3, 4, 6, 8 ]
-      - NumLoadsCoalescedB: [ 1, 2, 3, 4, 6, 8 ]
+      - NumLoadsCoalescedA: [ 1, -1 ]
+      - NumLoadsCoalescedB: [ 1, -1 ]
       - LoopUnroll: [8, 16]
     BenchmarkForkParameters:
     JoinParameters:
@@ -74,8 +74,8 @@ BenchmarkProblems:
       - WorkGroupShape: [ 0 ]
       - ThreadTileEdge: [2, 4, 6, 8]
       - ThreadTileShape: [ 0 ]
-      - NumLoadsCoalescedA: [ 1, 2, 3, 4, 6, 8 ]
-      - NumLoadsCoalescedB: [ 1, 2, 3, 4, 6, 8 ]
+      - NumLoadsCoalescedA: [ 1, -1 ]
+      - NumLoadsCoalescedB: [ 1, -1 ]
       - LoopUnroll: [8, 16]
     BenchmarkForkParameters:
     JoinParameters:
@@ -107,8 +107,8 @@ BenchmarkProblems:
       - WorkGroupShape: [ 0 ]
       - ThreadTileEdge: [2, 4, 6, 8]
       - ThreadTileShape: [ 0, 1 ]
-      - NumLoadsCoalescedA: [ 1, 2, 3, 4, 6, 8 ]
-      - NumLoadsCoalescedB: [ 1, 2, 3, 4, 6, 8 ]
+      - NumLoadsCoalescedA: [ 1, -1 ]
+      - NumLoadsCoalescedB: [ 1, -1 ]
       - LoopUnroll: [8, 16]
     BenchmarkForkParameters:
     JoinParameters:
@@ -140,8 +140,8 @@ BenchmarkProblems:
       - WorkGroupShape: [ 0 ]
       - ThreadTileEdge: [2, 4, 6, 8]
       - ThreadTileShape: [ 0 ]
-      - NumLoadsCoalescedA: [ 1, 2, 3, 4, 6, 8 ]
-      - NumLoadsCoalescedB: [ 1, 2, 3, 4, 6, 8 ]
+      - NumLoadsCoalescedA: [ 1, -1 ]
+      - NumLoadsCoalescedB: [ 1, -1 ]
       - LoopUnroll: [8, 16]
     BenchmarkForkParameters:
     JoinParameters:
@@ -151,7 +151,6 @@ BenchmarkProblems:
       - ProblemSizes: [ [32, 32, 32, 4000], [32, 32, 32, 4000], [2], [1536] ]
 
 LibraryLogic:
-  Dilation: 3
-  Threshold: 0.1
+  BranchPenalty: 0
 
 LibraryClient:
diff --git a/Tensile/Configs/rocblas_sgemm.yaml b/Tensile/Configs/rocblas_sgemm.yaml
index 346bf1cb7..c4ed34214 100644
--- a/Tensile/Configs/rocblas_sgemm.yaml
+++ b/Tensile/Configs/rocblas_sgemm.yaml
@@ -41,8 +41,8 @@ BenchmarkProblems:
       - ThreadTileShape: [ 0 ]
       - WorkGroupEdge: [8, 16]
       - ThreadTileEdge: [2, 4, 6, 8]
-      - NumLoadsCoalescedA: [ 1, 2, 3, 4, 6, 8 ]
-      - NumLoadsCoalescedB: [ 1, 2, 3, 4, 6, 8 ]
+      - NumLoadsCoalescedA: [ 1, -1 ]
+      - NumLoadsCoalescedB: [ 1, -1 ]
       - LoopUnroll: [8, 16]
     BenchmarkForkParameters:
     JoinParameters:
@@ -74,8 +74,8 @@ BenchmarkProblems:
       - WorkGroupShape: [ 0 ]
       - ThreadTileEdge: [2, 4, 6, 8]
       - ThreadTileShape: [ 0 ]
-      - NumLoadsCoalescedA: [ 1, 2, 3, 4, 6, 8 ]
-      - NumLoadsCoalescedB: [ 1, 2, 3, 4, 6, 8 ]
+      - NumLoadsCoalescedA: [ 1, -1 ]
+      - NumLoadsCoalescedB: [ 1, -1 ]
       - LoopUnroll: [8, 16]
     BenchmarkForkParameters:
     JoinParameters:
@@ -107,8 +107,8 @@ BenchmarkProblems:
       - WorkGroupShape: [ 0 ]
       - ThreadTileEdge: [2, 4, 6, 8]
       - ThreadTileShape: [ 0, 1 ]
-      - NumLoadsCoalescedA: [ 1, 2, 3, 4, 6, 8 ]
-      - NumLoadsCoalescedB: [ 1, 2, 3, 4, 6, 8 ]
+      - NumLoadsCoalescedA: [ 1, -1 ]
+      - NumLoadsCoalescedB: [ 1, -1 ]
       - LoopUnroll: [8, 16]
     BenchmarkForkParameters:
     JoinParameters:
@@ -140,8 +140,8 @@ BenchmarkProblems:
       - WorkGroupShape: [ 0 ]
       - ThreadTileEdge: [2, 4, 6, 8]
       - ThreadTileShape: [ 0 ]
-      - NumLoadsCoalescedA: [ 1, 2, 3, 4, 6, 8 ]
-      - NumLoadsCoalescedB: [ 1, 2, 3, 4, 6, 8 ]
+      - NumLoadsCoalescedA: [ 1, -1 ]
+      - NumLoadsCoalescedB: [ 1, -1 ]
       - LoopUnroll: [8, 16]
     BenchmarkForkParameters:
     JoinParameters:
@@ -151,7 +151,6 @@ BenchmarkProblems:
       - ProblemSizes: [ [32, 32, 32, 4000], [32, 32, 32, 4000], [2], [2880] ]
 
 LibraryLogic:
-  Dilation: 3
-  Threshold: 0.1
+  BranchPenalty: 0
 
 LibraryClient:
diff --git a/Tensile/Configs/rocblas_zgemm.yaml b/Tensile/Configs/rocblas_zgemm.yaml
index badbd3dfb..4082ed6c8 100644
--- a/Tensile/Configs/rocblas_zgemm.yaml
+++ b/Tensile/Configs/rocblas_zgemm.yaml
@@ -41,8 +41,8 @@ BenchmarkProblems:
       - ThreadTileShape: [ 0 ]
       - WorkGroupEdge: [8, 16]
       - ThreadTileEdge: [2, 4, 6, 8]
-      - NumLoadsCoalescedA: [ 1, 2, 3, 4, 6, 8 ]
-      - NumLoadsCoalescedB: [ 1, 2, 3, 4, 6, 8 ]
+      - NumLoadsCoalescedA: [ 1, -1 ]
+      - NumLoadsCoalescedB: [ 1, -1 ]
       - LoopUnroll: [8, 16]
     BenchmarkForkParameters:
     JoinParameters:
@@ -74,8 +74,8 @@ BenchmarkProblems:
       - WorkGroupShape: [ 0 ]
       - ThreadTileEdge: [2, 4, 6, 8]
       - ThreadTileShape: [ 0 ]
-      - NumLoadsCoalescedA: [ 1, 2, 3, 4, 6, 8 ]
-      - NumLoadsCoalescedB: [ 1, 2, 3, 4, 6, 8 ]
+      - NumLoadsCoalescedA: [ 1, -1 ]
+      - NumLoadsCoalescedB: [ 1, -1 ]
       - LoopUnroll: [8, 16]
     BenchmarkForkParameters:
     JoinParameters:
@@ -107,8 +107,8 @@ BenchmarkProblems:
       - WorkGroupShape: [ 0 ]
       - ThreadTileEdge: [2, 4, 6, 8]
       - ThreadTileShape: [ 0 ]
-      - NumLoadsCoalescedA: [ 1, 2, 3, 4, 6, 8 ]
-      - NumLoadsCoalescedB: [ 1, 2, 3, 4, 6, 8 ]
+      - NumLoadsCoalescedA: [ 1, -1 ]
+      - NumLoadsCoalescedB: [ 1, -1 ]
       - LoopUnroll: [8, 16]
     BenchmarkForkParameters:
     JoinParameters:
@@ -140,8 +140,8 @@ BenchmarkProblems:
       - WorkGroupShape: [ 0 ]
       - ThreadTileEdge: [2, 4, 6, 8]
       - ThreadTileShape: [ 0 ]
-      - NumLoadsCoalescedA: [ 1, 2, 3, 4, 6, 8 ]
-      - NumLoadsCoalescedB: [ 1, 2, 3, 4, 6, 8 ]
+      - NumLoadsCoalescedA: [ 1, -1 ]
+      - NumLoadsCoalescedB: [ 1, -1 ]
       - LoopUnroll: [8, 16]
     BenchmarkForkParameters:
     JoinParameters:
@@ -151,7 +151,6 @@ BenchmarkProblems:
       - ProblemSizes: [ [32, 32, 32, 4000], [32, 32, 32, 4000], [2], [1536] ]
 
 LibraryLogic:
-  Dilation: 3
-  Threshold: 0.1
+  BranchPenalty: 0
 
 LibraryClient:
diff --git a/Tensile/Configs/sgemm.yaml b/Tensile/Configs/sgemm.yaml
index 49e6d13c7..f9ddad0c5 100644
--- a/Tensile/Configs/sgemm.yaml
+++ b/Tensile/Configs/sgemm.yaml
@@ -40,19 +40,15 @@ BenchmarkProblems:
     ForkParameters:
       - WorkGroupEdge: [8, 16]
       - ThreadTileEdge: [2, 4, 8 ]
-    BenchmarkForkParameters:
-      - NumLoadsCoalescedA: [ 1, 2, 3, 4, 6 ]
-      - NumLoadsCoalescedB: [ 1, 2, 3, 4, 6 ]
       - LoopUnroll: [8, 16]
+    BenchmarkForkParameters:
     JoinParameters:
       - MacroTile
-      - DepthU
     BenchmarkJoinParameters:
     BenchmarkFinalParameters:
       - ProblemSizes: [ [16, 128], [16, 128], [256] ]
 
 LibraryLogic:
-  Dilation: 3
-  Threshold: 0.1
+  BranchPenalty: 0
 
 LibraryClient:
diff --git a/Tensile/Configs/sgemm_5760.yaml b/Tensile/Configs/sgemm_5760.yaml
index d8f077b85..7c167a650 100644
--- a/Tensile/Configs/sgemm_5760.yaml
+++ b/Tensile/Configs/sgemm_5760.yaml
@@ -39,8 +39,6 @@ BenchmarkProblems:
       - ThreadTileShape: [0, 1]
       - WorkGroupEdge: [8, 16]
       - ThreadTileEdge: [ 8 ]
-      - NumLoadsCoalescedA: [ 1, 2, 3, 4, 6, 8 ]
-      - NumLoadsCoalescedB: [ 1, 2, 3, 4, 6, 8 ]
       - LoopUnroll: [8, 16]
     #BenchmarkForkParameters:
     JoinParameters:
@@ -71,8 +69,6 @@ BenchmarkProblems:
       - ThreadTileShape: [0, 1]
       - WorkGroupEdge: [8, 16]
       - ThreadTileEdge: [4, 6, 8]
-      - NumLoadsCoalescedA: [ 1, 2, 3, 4, 5, 6 ]
-      - NumLoadsCoalescedB: [ 1, 2, 3, 4, 5, 6 ]
       - LoopUnroll: [8, 16]
     #BenchmarkForkParameters:
     JoinParameters:
@@ -103,8 +99,6 @@ BenchmarkProblems:
       - ThreadTileShape: [0, 1]
       - WorkGroupEdge: [8, 16]
       - ThreadTileEdge: [4, 6, 8]
-      - NumLoadsCoalescedA: [ 1, 2, 3, 4, 5, 6 ]
-      - NumLoadsCoalescedB: [ 1, 2, 3, 4, 5, 6 ]
       - LoopUnroll: [8, 16]
     #BenchmarkForkParameters:
     JoinParameters:
@@ -135,8 +129,6 @@ BenchmarkProblems:
       - ThreadTileShape: [0, 1]
       - WorkGroupEdge: [8, 16]
       - ThreadTileEdge: [4, 6, 8]
-      - NumLoadsCoalescedA: [ 1, 2, 3, 4, 5, 6 ]
-      - NumLoadsCoalescedB: [ 1, 2, 3, 4, 5, 6 ]
       - LoopUnroll: [8, 16]
     #BenchmarkForkParameters:
     JoinParameters:
diff --git a/Tensile/Configs/tensor_contraction.yaml b/Tensile/Configs/tensor_contraction.yaml
index dd44124d7..aa5e779f4 100644
--- a/Tensile/Configs/tensor_contraction.yaml
+++ b/Tensile/Configs/tensor_contraction.yaml
@@ -40,10 +40,8 @@ BenchmarkProblems:
     ForkParameters:
       - WorkGroupEdge: [8, 16]
       - ThreadTileEdge: [2, 4, 8 ]
-    BenchmarkForkParameters:
-      - NumLoadsCoalescedA: [ 1, 2, 3, 4, 6 ]
-      - NumLoadsCoalescedB: [ 1, 2, 3, 4, 6 ]
       - LoopUnroll: [8, 16]
+    BenchmarkForkParameters:
     JoinParameters:
       - MacroTile
       - DepthU
@@ -52,7 +50,6 @@ BenchmarkProblems:
       - ProblemSizes: [ [16, 128], [16, 128], [2, 2, 4], [256] ]
 
 LibraryLogic:
-  Dilation: 3
-  Threshold: 0.1
+  BranchPenalty: 0
 
 LibraryClient:
diff --git a/Tensile/LibraryLogic.py b/Tensile/LibraryLogic.py
index 881631f2b..823d48452 100644
--- a/Tensile/LibraryLogic.py
+++ b/Tensile/LibraryLogic.py
@@ -2,6 +2,7 @@
 import os.path
 import array
 import csv
+from sys import stdout
 
 from copy import deepcopy
 
@@ -12,7 +13,7 @@
 ################################################################################
 # Analyze Problem Type
 ################################################################################
-def analyzeProblemType( problemTypeTuple, analysisParameters ):
+def analyzeProblemType( problemTypeTuple, inputParameters ):
   problemType = problemTypeTuple[0]
   problemSizes = problemTypeTuple[1]
   dataFileName = problemTypeTuple[2]
@@ -20,9 +21,7 @@ def analyzeProblemType( problemTypeTuple, analysisParameters ):
   print2(HR)
   print1("# %s" % problemType)
 
-  #print "#  %s" % dataFileName
-  #print "#  %s" % solutionsFileName
-
+  ######################################
   # Read Solutions
   (problemSizes, solutions) = YAMLIO.readSolutions(solutionsFileName)
   print2("# ProblemSizes: %s" % problemSizes)
@@ -30,165 +29,89 @@ def analyzeProblemType( problemTypeTuple, analysisParameters ):
   print2("# Solutions:")
   solutionIdx = 0
   for solution in solutions:
-    print2("#  (%u) %s" % (solutionIdx, Solution.getNameMin(solution, solutionMinNaming)))
+    print2("#  (%u) %s" % (solutionIdx, Solution.getNameMin(solution, \
+        solutionMinNaming)))
     solutionIdx += 1
   print2(HR)
 
+  ######################################
   # Read Data From CSV
-  #numProblemSizes = problemSizes.numProblemSizes
-  data = BenchmarkDataAnalyzer(problemType, problemSizes, solutions, \
-      analysisParameters)
-  data.populateFromCSV(dataFileName)
-
-  ##############################################################################
-  # Determine Solutions Along Diagonal
-  # roughly same splitting regardless of sizeU
-  problemIndices = []
-  for numProblemsForIndex in data.numProblemSizes:
-    problemIndices.append(numProblemsForIndex-1)
-  diagonalRules = data.getFastestSolutionsAlongDiagonal(problemIndices)
-  if True:
-    print2("Diagonal Rules:")
-    for rule in diagonalRules:
-      string = "  if freeSize >=%4u" % data.problemIndexToSize[0][rule[1][0]]
-      for i in range(1, data.numIndices):
-        string += "x%4u" % data.problemIndexToSize[i][rule[1][i]]
-      string += " return S[%u] @ %5.0f-%5.0f>%5.0f GFlops is %s" \
-          % (rule[0], rule[2], rule[3], rule[4], \
-          data.solutionNames[rule[0]])
-      print2(string)
-
-  ##############################################################################
-  # Determine Skinny0 Solutions
-  skinnyRules01 = data.getSkinnySolutions(diagonalRules, problemIndices, \
-      data.idx0, data.idx1)
-  #print "Skinny Rules:"
-  #for rule in skinnyRules01:
-  #  string = "  if freeSize >=%4u" % data.problemIndexToSize[0][rule[1][0]]
-  #  for i in range(1, data.numIndices):
-  #    string += "x%4u" % data.problemIndexToSize[i][rule[1][i]]
-  #  string += " return S[%u] @ %5.0f-%5.0f>%5.0f GFlops is %s" \
-  #      % (rule[0], rule[2], rule[3], rule[4], \
-  #      data.solutionNames[rule[0]])
-
-  ##############################################################################
-  # Determine Skinny1 Solutions
-  skinnyRules10 = data.getSkinnySolutions(diagonalRules, problemIndices, \
-      data.idx1, data.idx0)
-
-  # list solutions that actually get used
-  solutionIndicesUsed = []
-  for rule in skinnyRules01:
-    pass
-  for rule in skinnyRules10:
-    pass
-  for rule in diagonalRules:
-    solutionIdx = rule[0]
-    solution = solutions[solutionIdx]
-    MT0 = solution["MacroTile0"]
-    MT1 = solution["MacroTile1"]
-    DU = solution["DepthU"]
-    #print "Rule Tile S[%u]: %ux%ux%u" % (solutionIdx, MT0, MT1, DU)
-    # is this solution in the list
-    inList = False
-    for solutionUsed in solutionIndicesUsed:
-      if solutionUsed[0] == solutionIdx:
-        inList = True
-        break
-    if not inList:
-      insertIdx = len(solutionIndicesUsed)
-      for i in range(0, len(solutionIndicesUsed)):
-        iMT0 = solutionIndicesUsed[i][1]
-        iMT1 = solutionIndicesUsed[i][2]
-        iDU  = solutionIndicesUsed[i][3]
-        #print "  compare S[%u]: %ux%ux%u" % (solutionIndicesUsed[i][0], \
-        #    iMT0, iMT1, iDU)
-        if MT0*MT1 < iMT0*iMT1:
-          insertIdx = i
-          break
-        elif MT0*MT1 > iMT0*iMT1:
-          continue
-        else: # MT == MT
-          if DU < iDU:
-            insertIdx = i
-            break
-          else:
-            continue
-
-        # if i'm smaller than i, insert me before i
-      #print "insert: %u" % insertIdx
-      solutionIndicesUsed.insert(insertIdx, [solutionIdx, MT0, MT1, DU])
-  #print solutionIndicesUsed
-
-  # list of solutions used
-  solutionsUsed = []
-  for solutionIndexUsed in solutionIndicesUsed:
-    solutionsUsed.append(solutions[solutionIndexUsed[0]])
-
-  # translate rules to new solution indices
-  for rule in skinnyRules01:
-    pass
-  for rule in skinnyRules10:
-    pass
-  for ruleIdx in range(0, len(diagonalRules)):
-    solutionIdx = diagonalRules[ruleIdx][0]
-    for i in range(0, len(solutionIndicesUsed)):
-      solutionIndexUsed = solutionIndicesUsed[i]
-      if solutionIdx == solutionIndexUsed[0]:
-        diagonalRules[ruleIdx][0] = i
-        break
-    # change problemSizeIndices to sizes
-    for i in range(0, 3):
-      diagonalRules[ruleIdx][1][i] = \
-          data.problemIndexToSize[i][ diagonalRules[ruleIdx][1][i] ]
-
-  print2("# New Rules: %s" % diagonalRules)
-
+  logicAnalyzer = LogicAnalyzer( \
+      problemType, problemSizes, solutions, inputParameters)
+  logicAnalyzer.populateFromCSV(dataFileName)
+
+  ######################################
+  # Remove invalid solutions
+  logicAnalyzer.removeInvalidSolutions()
+
+  ######################################
+  # Remove least important solutions
+  logicAnalyzer.removeLeastImportantSolutions()
+
+  ######################################
+  # Correct outliers
+  if inputParameters["SmoothOutliers"]:
+    logicAnalyzer.smoothOutliers()
+  #logicAnalyzer.print2D([0, 0])
+
+  ######################################
+  # Create Rules
+  logic = logicAnalyzer.enRule(0, logicAnalyzer.globalIndexRange)
+  print2("# Final Logic:")
+  print2(logic)
+  logicComplexity = [0]*logicAnalyzer.numIndices
+  logicAnalyzer.scoreLogicComplexity(logic, logicComplexity)
+  print2("Logic Complexity: %s" % logicComplexity)
+  score = logicAnalyzer.scoreRangeForLogic( \
+      logicAnalyzer.globalIndexRange, logic)
+  print1("\nScore: %.0f ms" % (score/1000))
+
+  logicAnalyzer.prepareLogic(logic)
 
   #return (skinnyRules01, skinnyRules10, diagonalRules)
-  return (problemType, solutionsUsed, [], [], diagonalRules )
+  #return (problemType, logicAnalyzer.solutionsUsed, [], [], logicAnalyzer.diagonalRules )
+  return (problemType, logicAnalyzer.solutions, logicAnalyzer.indexOrder, logic)
 
 
 
 ################################################################################
-# BenchmarkDataAnalyzer
+# LogicAnalyzer
 ################################################################################
-class BenchmarkDataAnalyzer:
-
-  ########################################
-  # diagonal rule looks like
-  # 0: solutionIdx
-  # 1: problemIndices for minThreshold problem
-  # 2: gflops at above minSize
-  # 3: maxGFlops for this solution along diagonal in interval it won
-  # 4: gflops of prior winner at minSize, i.e., what performance did it beat
-
-  ########################################
-  # skinny rule looks like
-  # 0: solutionIdx
-  # 1: problemIndices for minThreshold problem
-  # 2: gflops at above minSize
+class LogicAnalyzer:
 
+  ##############################################################################
+  ##############################################################################
+  ###
+  ###  Entry / Top-Level Functions
+  ###
+  ##############################################################################
+  ##############################################################################
 
-  def __init__(self, problemType, problemSizes, solutions, analysisParameters):
+  ##############################################################################
+  # ENTRY: Init
+  ##############################################################################
+  def __init__(self, problemType, problemSizes, solutions, inputParameters):
     self.problemType = problemType
     self.problemSizes = problemSizes
-    self.analysisParameters = analysisParameters
+    self.parameters = inputParameters
     print2("ProblemSizes: %s" % self.problemSizes)
     # TODO verify that data is symmetric for diagonal
     #if self.problemSizes[self.problemType["Index0"]] \
     #    != self.problemSizes[self.problemType["Index1"]]:
     #  printExit("d0 / d1 must be symmetric for analysis.")
     self.numProblemSizes = problemSizes.numProblemSizes # native order
-    print2("NumProblemSizes: %s" % self.numProblemSizes)
+    print1("NumProblemSizes: %s" % self.numProblemSizes)
     self.numIndices = len(self.numProblemSizes)
     self.solutions = solutions
     self.numSolutions = len(self.solutions)
     self.solutionMinNaming = Solution.getMinNaming(solutions)
     self.solutionNames = []
+    self.solutionTiles = []
     for solution in self.solutions:
       self.solutionNames.append(Solution.getNameMin(solution, \
           self.solutionMinNaming))
+      self.solutionTiles.append("%ux%u"%(solution["MacroTile0"], solution["MacroTile1"]))
+    self.flopsPerMac = self.problemType["DataType"].flopsPerMac()
 
     # special indices
     self.idx0 = self.problemType["Index0"]
@@ -229,13 +152,23 @@ def __init__(self, problemType, problemSizes, solutions, analysisParameters):
         currentSize += currentStride
         currentStride += index[2]
         idx += 1
-    #print "S->I %s" % self.problemSizeToIndex
-    #print "I->S %s" % self.problemIndexToSize
+    self.rangeIndicesFree = range(0, self.problemType["NumIndicesC"])
+    self.rangeIndicesSummation = range(self.problemType["NumIndicesC"], \
+        self.problemType["TotalIndices"])
+    self.indexOrder = self.recommendedIndexOrder()
+    print2("IndexOrder: %s" % self.indexOrder)
+    self.globalIndexRange = []
+    for i in range(0, self.numIndices):
+      self.globalIndexRange.append([0, self.numProblemSizes[i]])
+    self.problemIndicesForGlobalRange \
+        = self.problemIndicesForRange(self.globalIndexRange)
+    self.tab = [""]*self.numIndices
 
 
 
   ##############################################################################
-  # Read In CSV
+  # ENTRY: Read In CSV
+  ##############################################################################
   def populateFromCSV(self, dataFileName):
 
     # open file
@@ -271,10 +204,6 @@ def populateFromCSV(self, dataFileName):
         for i in range(0, self.numIndices):
           problemIndices.append(self.problemSizeToIndex[i][problemSize[i]])
         serialIdx = self.indicesToSerial(0, problemIndices)
-        #print "%s -> %s -> %u" % (problemSize, problemIndices, serialIdx)
-
-        # total size
-        #totalFlops = float(row[totalSizeIdx])
 
         # data
         solutionIdx = 0
@@ -288,295 +217,707 @@ def populateFromCSV(self, dataFileName):
 
 
   ##############################################################################
-  # Get Fastest Solutions Along Diagonal (d0=d1) for largest sizes
+  # ENTRY: Remove Invalid Solutions
+  ##############################################################################
+  def removeInvalidSolutions(self):
+    # Repeatedly drop any solution that recorded 0 GFlops for some problem in
+    # the global range; rescan after each removal because removeSolution()
+    # rebuilds self.data, self.numSolutions and self.solutionNames.
+    while True:
+      invalidIdx = -1
+      for problemIndices in self.problemIndicesForGlobalRange:
+        problemSerial = self.indicesToSerial(0, problemIndices)
+        for solutionIdx in range(0, self.numSolutions):
+          if self.data[problemSerial+solutionIdx] == 0:
+            invalidIdx = solutionIdx
+            break
+        if invalidIdx >= 0:
+          break # first hit is enough; no need to scan remaining problems
+      if invalidIdx < 0:
+        return # every remaining solution is non-zero for every problem
+      print1("# Removing Invalid Solution: %u %s" \
+          % (invalidIdx, self.solutionNames[invalidIdx]) )
+      self.removeSolution(invalidIdx)
+
+
+  ##############################################################################
+  # ENTRY: Remove Least Important Solutions
+  ##############################################################################
+  def removeLeastImportantSolutions(self):
+    # Drop the least important solution until the least important one left
+    # accounts for at least parameters["SolutionImportanceMin"] of the saved
+    # time. Stop at one solution: leastImportantSolution() reads slots 0 and 1.
+    while self.numSolutions > 1:
+      (lisIdx, lisPercSaved, lisPercWins, lisPercExec) \
+          = self.leastImportantSolution()
+      if not (lisPercSaved < self.parameters["SolutionImportanceMin"]):
+        break
+      self.removeSolution(lisIdx)
+
+
+  ##############################################################################
+  # ENTRY: Smooth Outliers
   ##############################################################################
-  def getFastestSolutionsAlongDiagonal(self, problemIndices):
-    print2("\nFastest Diagonal idxU: %u" % problemIndices[self.idxU])
-    # abstract to multidimensions
-    # what is the diagonal
-    dilation = self.analysisParameters["Dilation"]
-    threshold = self.analysisParameters["Threshold"]
-    numProblems0 = self.numProblemSizes[self.idx0]
+  def smoothOutliers(self): # replace flagged gflops samples by neighbor mean
+    problemSizes = [0]*self.numIndices
+    for problemIndices in self.problemIndicesForGlobalRange:
+      problemSerial = self.indicesToSerial(0, problemIndices)
 
-    ############################################################################
-    # determine winner at largest size
-    solutionNumWins = [0]*self.numSolutions
-    solutionGFlops = [0]*self.numSolutions
-    for problemSizeIdx in range(max(0,numProblems0-dilation*2), numProblems0):
-      problemIndices[self.idx0] = problemSizeIdx
-      problemIndices[self.idx1] = problemSizeIdx
-      problemIdx = self.indicesToSerial(0, problemIndices)
-      winnerIdx = -1
-      winnerGFlops = -1
       for solutionIdx in range(0, self.numSolutions):
-        solutionSerialIdx = problemIdx + solutionIdx
-        solutionTmpGFlops = self.data[solutionSerialIdx]
-        if solutionTmpGFlops > winnerGFlops:
-          winnerIdx = solutionIdx
-          winnerGFlops = solutionTmpGFlops
-        #print "updated winner: ", winnerIdx
-      #print winnerIdx
-      solutionNumWins[winnerIdx] += 1
-      if winnerGFlops > solutionGFlops[winnerIdx]:
-        solutionGFlops[winnerIdx] = winnerGFlops
-    largestWinnerIdx = -1
-    largestWinnerNumWins = -1
-    largestWinnerGFlops = -1
-    #print "FastestWins:"
+        gflops = self.data[problemSerial+solutionIdx]
+        neighborGFlops = [] # before/after samples along each interior index
+        smoothProblem = False
+        for iIdx in range(0, self.numIndices):
+          if problemIndices[iIdx] > 0 \
+              and problemIndices[iIdx] < self.numProblemSizes[iIdx]-1: # interior points only
+            neighborBeforeIndices = deepcopy(problemIndices)
+            neighborAfterIndices = deepcopy(problemIndices)
+            neighborBeforeIndices[iIdx] -= 1
+            neighborAfterIndices[iIdx] += 1
+            neighborBeforeIdx = self.indicesToSerial(0, neighborBeforeIndices)
+            neighborAfterIdx = self.indicesToSerial(0, neighborAfterIndices)
+            neighborBeforeGFlops = self.data[neighborBeforeIdx+solutionIdx] # read from the array this loop also updates
+            neighborAfterGFlops = self.data[neighborAfterIdx+solutionIdx]
+            neighborGFlops.append(neighborBeforeGFlops)
+            neighborGFlops.append(neighborAfterGFlops)
+            if neighborBeforeGFlops > gflops \
+                and neighborAfterGFlops < gflops : # NOTE(review): flags before > current > after, not a dip below both neighbors - confirm intent
+              smoothProblem = True
+        if smoothProblem:
+          s = "" # NOTE(review): s is built below but never printed or returned in this function
+          for i in range(0, self.numIndices):
+            problemSizes[i] = self.problemIndexToSize[i][problemIndices[i]]
+            s += "%u, " % problemSizes[i]
+          new = sum(neighborGFlops)/len(neighborGFlops) # mean of all collected neighbors
+          old = self.data[problemSerial+solutionIdx]
+          s += "%f -> %f" % (old, new)
+          self.data[problemSerial+solutionIdx] \
+              = sum(neighborGFlops)/len(neighborGFlops) # same expression as new above
+
+
+  ##############################################################################
+  # ENTRY: En Rule
+  # currentIndexIndex = 0, 1, 2, 3...
+  # currentIndexRange will have only 1 size for prior indices (unless initial)
+  #
+  # Rule:
+  # [128, [
+  #         [64, [
+  #                [16, 0],
+  #                [2880,1]
+  #              ]
+  #         ],
+  #         [96, [
+  #                [16, 0],
+  #                [64, 1]
+  #              ]
+  #         ]
+  #       ]
+  # ], another
+  #
+  #
+  ##############################################################################
+  def enRule(self, currentIndexIndex, currentIndexRange):
+    # Build the rule list for index self.indexOrder[currentIndexIndex].
+    #
+    # currentIndexIndex: position in self.indexOrder of the index to split.
+    # currentIndexRange: one [begin, end) pair of problem indices per index.
+    #
+    # Returns a list of [threshold, value] rules. threshold is the highest
+    # problem index the rule was extended to, or -1 when the range holds a
+    # single size; value is a winning solution index at the last index and
+    # the nested rule list from the recursive call otherwise.
+    #
+    # A new rule is started only when the candidate for the next problem
+    # index scores lower (time plus BranchPenalty terms) than stretching the
+    # prior rule over it; otherwise the prior rule's threshold is advanced.
+    # PrintLevel 1 progress marks: "#" rule added, " " same winner, "." prior
+    # rule kept.
+    #
+    # self.tab[cii] holds the prefix used for progress and debug output.
+    cii = currentIndexIndex
+    # prefix lists the problem index already fixed for each prior index
+    self.tab[cii] = "[%s] " % ",".join([ "%2u" \
+        % currentIndexRange[self.indexOrder[i]][0] for i in range(0, cii) ])
+    tab = self.tab[cii]
+    if globalParameters["PrintLevel"] == 1:
+      stdout.write("\n%s"%tab)
+    currentIndex = self.indexOrder[currentIndexIndex]
+    print2("%senRule(%s)" % (tab, currentIndexRange))
+    nextIndexIndex = currentIndexIndex+1
+    nextIndexRange = deepcopy(currentIndexRange)
+    isLastIndex = currentIndexIndex == self.numIndices-1
+    ruleList = []
+
+    ########################################
+    # if there's only 1 problem size here
+    ########################################
+    if currentIndexRange[currentIndex][1] \
+        - currentIndexRange[currentIndex][0] == 1:
+
+      ########################################
+      # this is last index, so just return fastest solution
+      if isLastIndex:
+        # TODO optimize b/c this should be only single problem
+        # Wrap the winner in a [threshold, value] pair so this branch
+        # returns the same list-of-rules shape as every other branch;
+        # getSolutionForProblemIndicesUsingLogic subscripts each entry as
+        # rule[0] / rule[1] and cannot walk a flat [-1, winnerIdx] list.
+        # The -1 threshold acts as a catch-all there.
+        winnerIdx = self.winnerForRange(currentIndexRange)
+        ruleList.append([-1, winnerIdx])
+        # the "#" written below marks this rule in the PrintLevel 1
+        # progress output
+        if globalParameters["PrintLevel"] == 1:
+          stdout.write("#")
+
+      ########################################
+      # this isn't last index, so just recursively return next index
+      else:
+        # the only size here needs no threshold of its own: defer entirely
+        # to the rules computed for the next index
+        rule = [ -1, self.enRule(nextIndexIndex, nextIndexRange) ]
+        ruleList.append(rule)
+        if globalParameters["PrintLevel"] == 1:
+          stdout.write("#")
+
+    ########################################
+    # full iterative rule list
+    ########################################
+    else:
+
+      ########################################
+      # create initial rule
+      initialSize = min(currentIndexRange[currentIndex][0] \
+          + self.parameters["InitialSolutionWindow"], \
+          self.numProblemSizes[currentIndex])
+      nextIndexRange[currentIndex][1] = initialSize
+      if isLastIndex:
+        winnerIdx = self.winnerForRange(nextIndexRange)
+        initialRule = [ currentIndexRange[currentIndex][0], winnerIdx]
+      else:
+        # nested rules are computed over the initial window of this index
+        initialRule = [ currentIndexRange[currentIndex][0], \
+            self.enRule(nextIndexIndex, nextIndexRange) ]
+        # the rule's threshold starts at this range's first problem index
+      ruleList.append(initialRule)
+      if globalParameters["PrintLevel"] == 1:
+        stdout.write("#")
+
+      ########################################
+      # for all problem indices in this index
+      for problemIndex in range(currentIndexRange[currentIndex][0]+1, \
+          currentIndexRange[currentIndex][1]):
+        nextIndexRange[currentIndex][0] = problemIndex
+        nextIndexRange[currentIndex][1] = problemIndex+1
+        priorRule = ruleList[len(ruleList)-1]
+        priorRuleForSize = deepcopy(priorRule)
+        priorRuleForSize[0] = problemIndex
+
+        if isLastIndex:
+          winnerIdx = self.winnerForRange(nextIndexRange)
+          candidateRule = [ problemIndex, winnerIdx]
+        else:
+          candidateRule = [ problemIndex, self.enRule(nextIndexIndex, \
+              nextIndexRange) ]
+
+        ########################################
+        # candidate same as prior
+        if candidateRule[1] == priorRule[1]:
+          # identical outcome: stretch the prior rule to cover this size
+          ruleList[len(ruleList)-1][0] = problemIndex
+          if globalParameters["PrintLevel"] == 1:
+            stdout.write(" ")
+          continue
+
+        ########################################
+        # compare candidate vs prior
+        else:
+          # each score = time over this size + BranchPenalty * the sum of
+          # the complexity counts filled in by scoreLogicComplexity
+          # score prior
+          priorRuleScore = self.scoreRangeForLogic(nextIndexRange, \
+              [priorRuleForSize])
+          logicComplexity = [0]*self.numIndices
+          self.scoreLogicComplexity( \
+              [priorRuleForSize], logicComplexity)
+          priorRuleScore += self.parameters["BranchPenalty"] \
+              * sum(logicComplexity)
+          # score candidate
+          candidateRuleScore = self.scoreRangeForLogic(nextIndexRange, \
+              [candidateRule])
+          logicComplexity = [0]*self.numIndices
+          self.scoreLogicComplexity( \
+              [candidateRule], logicComplexity)
+          candidateRuleScore += self.parameters["BranchPenalty"] \
+              * sum(logicComplexity)
+          candidateRuleScore += self.parameters["BranchPenalty"] # penalize
+          candidateFaster = candidateRuleScore < priorRuleScore
+          print2("%sP[%2u]: %s %s~%.0fus < %s~%.0fus" % (tab, problemIndex, \
+              "wins" if candidateFaster else "same", \
+              candidateRule, candidateRuleScore, priorRuleForSize, \
+              priorRuleScore ))
+
+          ########################################
+          # candidate wins
+          if candidateFaster:
+            ruleList.append(candidateRule)
+            if globalParameters["PrintLevel"] == 1:
+              stdout.write("#")
+
+          ########################################
+          # prior wins
+          else:
+            if globalParameters["PrintLevel"] == 1:
+              stdout.write(".")
+            ruleList[len(ruleList)-1][0] = problemIndex
+
+    # list of [threshold, value] rules for this index
+    return ruleList
+
+
+
+  ##############################################################################
+  ##############################################################################
+  ###
+  ###  Mid-Level Functions
+  ###
+  ##############################################################################
+  ##############################################################################
+
+
+
+  ##############################################################################
+  # Prepare Logic
+  # convert threshold indices to sizes
+  # last threshold = -1
+  ##############################################################################
+  def prepareLogic(self, logic):
+    # In place: turn each rule's threshold index into a problem size and mark
+    # the final rule of every level with -1; recurse into the nested logic.
+    levelsLeft = self.getLogicDepth(logic)
+    if levelsLeft == 0: return
+    index = self.indexOrder[self.numIndices - levelsLeft]
+    lastRule = len(logic)-1
+    for ruleIdx, rule in enumerate(logic):
+      isLast = ruleIdx == lastRule
+      rule[0] = -1 if isLast else self.problemIndexToSize[index][rule[0]]
+      self.prepareLogic(rule[1])
+
+
+  ##############################################################################
+  # Print2D
+  ##############################################################################
+  def print2D(self, indices ):
+    # Write Winner2D_<indices>.csv under globalParameters["WorkingPath"].
+    # indices fixes every index other than idx0 / idx1. Per size pair the
+    # file tabulates: winning solution, its gflops, winner / runner-up gflops
+    # ratio, runner-up solution, then the gflops of each solution.
+    # NOTE(review): table axes use numProblemSizes / problemIndexToSize [0]
+    # and [1] while problemIndices is set through idx0 / idx1 - confirm.
+    indicesIdx = 0
+    problemIndices = []
+    for i in range(0, self.numIndices):
+      if i == self.idx0:
+        problemIndices.append(-1)
+      elif i == self.idx1:
+        problemIndices.append(-1)
+      else:
+        problemIndices.append(indices[indicesIdx])
+        indicesIdx += 1
+
+    winnerIndices = []
+    w = "winner"
+    g = "gflops"
+    f = "faster"
+    s = "second"
+    sss = []
+    for sIdx in range(0, self.numSolutions):
+      sss.append("Sol[%u]" % sIdx)
+    for j in range(0, self.numProblemSizes[1]):
+      w += ",%4u" % self.problemIndexToSize[1][j]
+      g += ",%4u" % self.problemIndexToSize[1][j]
+      f += ",%4u" % self.problemIndexToSize[1][j]
+      s += ",%4u" % self.problemIndexToSize[1][j]
+      for sIdx in range(0, self.numSolutions):
+        sss[sIdx] += ",%4u" % self.problemIndexToSize[1][j]
+    w += "\n"
+    g += "\n"
+    f += "\n"
+    s += "\n"
+    for sIdx in range(0, self.numSolutions):
+      sss[sIdx] += "\n"
+    for i in range(0, self.numProblemSizes[0]):
+      problemIndices[self.idx0] = i
+      w += "%4u" % self.problemIndexToSize[0][i]
+      g += "%4u" % self.problemIndexToSize[0][i]
+      f += "%4u" % self.problemIndexToSize[0][i]
+      s += "%4u" % self.problemIndexToSize[0][i]
+      for sIdx in range(0, self.numSolutions):
+        sss[sIdx] += "%4u" % self.problemIndexToSize[0][i]
+      for j in range(0, self.numProblemSizes[1]):
+        problemIndices[self.idx1] = j
+        problemSerial = self.indicesToSerial(0, problemIndices)
+        for sIdx in range(0, self.numSolutions):
+          sss[sIdx] += ",%f" % self.data[problemSerial+sIdx]
+        winnerIdx = 0 if self.data[problemSerial+0] \
+            > self.data[problemSerial+1] else 1 # ties favor solution 1
+        secondIdx = 1 - winnerIdx
+        winnerGFlops = self.data[problemSerial+winnerIdx]
+        secondGFlops = self.data[problemSerial+secondIdx]
+        for solutionIdx in range(2, self.numSolutions):
+          solutionGFlops = self.data[problemSerial + solutionIdx]
+          if solutionGFlops > winnerGFlops:
+            secondIdx = winnerIdx
+            secondGFlops = winnerGFlops
+            winnerIdx = solutionIdx
+            winnerGFlops = solutionGFlops
+          elif solutionGFlops > secondGFlops: # new runner-up, winner unchanged
+            secondIdx = solutionIdx
+            secondGFlops = solutionGFlops
+        if winnerIdx not in winnerIndices:
+          winnerIndices.append(winnerIdx)
+        w += ",%4u" % winnerIdx
+        g += ",%f" % winnerGFlops
+        f += ",%f" % (winnerGFlops/secondGFlops)
+        s += ",%4u" % (secondIdx)
+      w += "\n"
+      g += "\n"
+      f += "\n"
+      s += "\n"
+      for sIdx in range(0, self.numSolutions):
+        sss[sIdx] += "\n"
+
+    w += "\n\n"
+    g += "\n\n"
+    f += "\n\n"
+    s += "\n\n"
+    for sIdx in range(0, self.numSolutions):
+      sss[sIdx] += "\n\n"
+    w += "Winners:\n"
+    for winnerIdx in winnerIndices:
+      w += "%4u, %s, %s\n" % (winnerIdx, self.solutionTiles[winnerIdx], self.solutionNames[winnerIdx])
+
+    printFileName = "Winner2D"
+    for idx in indices:
+      printFileName += "_%u" % idx
+    printFileName += ".csv"
+    printFile = open(os.path.join(globalParameters["WorkingPath"], printFileName), "w")
+    printFile.write( w )
+    printFile.write( g )
+    printFile.write( f )
+    printFile.write( s )
+    for sIdx in range(0, self.numSolutions):
+      printFile.write( sss[sIdx] )
+    printFile.close()
+
+
+  ##############################################################################
+  # Least Important Solution
+  ##############################################################################
+  def leastImportantSolution(self): # -> (idx, savedFrac, winsFrac, execFrac)
+    solutionImportance = [] # rows: [solutionIdx, savedMs, wins, execMs]
     for i in range(0, self.numSolutions):
-      #print "sol[%u] = %u wins @ %.0f GFlops" \
-      #    % (i, solutionNumWins[i], solutionGFlops[i])
-      if solutionNumWins[i] > largestWinnerNumWins:
-        largestWinnerIdx = i
-        largestWinnerNumWins = solutionNumWins[i]
-        largestWinnerGFlops = solutionGFlops[i]
-    #print "Winner at Largest Problem: S[%u] @ %.0f GFlops with %u/%u wins" \
-    #    % (largestWinnerIdx, largestWinnerGFlops, largestWinnerNumWins, \
-    #    dilation*2)
-    problemIndices[self.idx0] = numProblems0-1
-    problemIndices[self.idx1] = numProblems0-1
-    largestWinnerAtLargestProblemIdx = self.indicesToSerial(largestWinnerIdx, \
-        problemIndices)
-    largestWinnerGFlopsAtLargestSize = \
-        self.data[largestWinnerAtLargestProblemIdx]
-
-    ############################################################################
-    # Diagonal Rule
-    # solutionIdx, minSizeThresholdIdx, gflops at minSize, maxGFlops, oldGFlops
-    numRules = 1
-    diagonalRules = [ [largestWinnerIdx, deepcopy(problemIndices), \
-        largestWinnerGFlopsAtLargestSize, largestWinnerGFlops, -1] ]
-
-    ############################################################################
-    # For largest to smallest, determine fastest solution
-    for problemSizeIdx in range(numProblems0-2, -1, -1):
-      problemIndices[self.idx0] = problemSizeIdx
-      problemIndices[self.idx1] = problemSizeIdx
-      problemIdx = self.indicesToSerial(0, problemIndices)
-
-      # current rule winner performance at this problemSizeIdx
-      ruleWinnerIdx = diagonalRules[-1][0]
-      ruleWinnerGFlopsForSize = self.data[problemIdx + ruleWinnerIdx]
-
-      #determine fastest at this problemSizeIdx
-      (winnerForSizeIdx, winnerForSizeGFlops) = \
-          self.getWinnerForProblem( problemIndices )
-
-      # ruleWinner also wins at this problem size (at least by threshold)
-      if winnerForSizeIdx == ruleWinnerIdx \
-          or ruleWinnerGFlopsForSize > (1-threshold)*winnerForSizeGFlops:
-        # just update rule
-        diagonalRules[numRules-1][1] = deepcopy(problemIndices)
-        diagonalRules[numRules-1][2] = ruleWinnerGFlopsForSize
-        diagonalRules[numRules-1][3] = max(diagonalRules[numRules-1][3], \
-            ruleWinnerGFlopsForSize)
-
-      # we have a new candidate winner
-      # only keep it if don't revert back to ruleWinner over next Dilation
+      solutionImportance.append([i, 0, 0, 0])
+    problemSizes = [0]*self.numIndices
+    totalSavedMs = 0
+    totalExecMs = 0
+    totalWins = 0
+    for problemIndices in self.problemIndicesForGlobalRange:
+      for i in range(0, self.numIndices):
+        problemSizes[i] = self.problemIndexToSize[i][problemIndices[i]]
+      totalFlops = self.flopsPerMac
+      for size in problemSizes:
+        totalFlops *= size
+      # seed winner / runner-up from solutions 0 and 1 (ties favor 1)
+      problemSerial = self.indicesToSerial(0, problemIndices)
+      if self.data[problemSerial+0] > self.data[problemSerial+1]:
+        winnerIdx = 0
+        winnerGFlops = self.data[problemSerial+0]
+        secondIdx = 1
+        secondGFlops = self.data[problemSerial+1]
       else:
+        winnerIdx = 1
+        winnerGFlops = self.data[problemSerial+1]
+        secondIdx = 0
+        secondGFlops = self.data[problemSerial+0]
+      for solutionIdx in range(2, self.numSolutions):
+        solutionGFlops = self.data[problemSerial + solutionIdx]
+        if solutionGFlops > winnerGFlops:
+          secondIdx = winnerIdx
+          secondGFlops = winnerGFlops
+          winnerIdx = solutionIdx
+          winnerGFlops = solutionGFlops
+        elif solutionGFlops > secondGFlops: # new runner-up, winner unchanged
+          secondIdx = solutionIdx
+          secondGFlops = solutionGFlops
+      winnerTimeMs = totalFlops / winnerGFlops / 1000000
+      secondTimeMs = totalFlops / secondGFlops / 1000000
+      solutionImportance[winnerIdx][1] += (secondTimeMs - winnerTimeMs)
+      solutionImportance[winnerIdx][2] += 1
+      solutionImportance[winnerIdx][3] += winnerTimeMs
+      totalSavedMs += secondTimeMs - winnerTimeMs
+      totalExecMs += winnerTimeMs
+      totalWins += 1
+    solutionImportance.sort(key=lambda x: x[1]) # least time saved first
+    return ( solutionImportance[0][0], \
+        solutionImportance[0][1] / totalSavedMs, \
+        float(solutionImportance[0][2]) / totalWins, \
+        solutionImportance[0][3] / totalExecMs )
 
-        # check if we don't revert back to ruleWinner over next Dilation probs
-        revert = False
-        endDilationIdx = max(-1, problemSizeIdx-dilation)
-        for dilationSizeIdx in range(problemSizeIdx-1, \
-            endDilationIdx, -1):
-          problemIndices[self.idx0] = dilationSizeIdx
-          problemIndices[self.idx1] = dilationSizeIdx
-          dilationIdx = self.indicesToSerial(0, problemIndices)
-          ruleWinnerGFlopsForDilation = self.data[dilationIdx \
-              + ruleWinnerIdx]
-          #determine fastest at this problemSizeIdx
-          (winnerForDilationIdx, winnerForDilationGFlops) = \
-              self.getWinnerForProblem(problemIndices)
-
-          # ruleWinner also wins at dilation size (at least by threshold)
-          if winnerForDilationIdx == ruleWinnerIdx \
-              or ruleWinnerGFlopsForDilation \
-              > (1-threshold)*winnerForSizeGFlops:
-            # yes, within Dilation, we've returned to same winner
-            revert = True
-            # so update rule for this size
-            diagonalRules[numRules-1][1] = deepcopy(problemIndices)
-            diagonalRules[numRules-1][2] = winnerForDilationGFlops
-            diagonalRules[numRules-1][3] = max(diagonalRules[numRules-1][3], \
-                winnerForSizeGFlops)
-            # resume outer loop after dilation
-            problemSizeIdx = dilationSizeIdx
+
+  ##############################################################################
+  # Remove Solution
+  ##############################################################################
+  def removeSolution(self, removeSolutionIdx):
+    # Drop one solution and compact every per-solution structure to match:
+    # solutions, solutionMinNaming, solutionNames, solutionTiles,
+    # numSolutions, totalSize and the flat gflops array self.data.
+
+    # remember the previous layout; the new table is copied out of it
+    priorCount = self.numSolutions
+    priorData = self.data
+    keptIndices = [ i for i in range(0, priorCount) \
+        if i != removeSolutionIdx ]
+
+    # rebuild the solution list, then the names / tiles derived from it
+    self.solutions = [ self.solutions[i] for i in keptIndices ]
+    self.solutionMinNaming = Solution.getMinNaming(self.solutions)
+    self.solutionNames = []
+    self.solutionTiles = []
+    for kept in self.solutions:
+      minName = Solution.getNameMin(kept, self.solutionMinNaming)
+      tile = "%ux%u"%(kept["MacroTile0"], kept["MacroTile1"])
+      self.solutionNames.append(minName)
+      self.solutionTiles.append(tile)
+    self.numSolutions = len(self.solutions)
+
+    # re-pack data: each problem owns numSolutions consecutive entries
+    self.totalSize = self.totalProblems * self.numSolutions
+    self.data = array.array('f', [0]*self.totalSize)
+    for problemIndex in range(0, self.totalProblems):
+      # offsets of this problem's first entry in the new and old tables
+      newBase = problemIndex*self.numSolutions
+      oldBase = problemIndex*priorCount
+      for newSolutionIdx, oldSolutionIdx in enumerate(keptIndices):
+        self.data[newBase+newSolutionIdx] \
+            = priorData[oldBase+oldSolutionIdx]
+
+
+  ##############################################################################
+  # Score Range For Logic
+  ##############################################################################
+  def scoreRangeForLogic(self, indexRange, logic):
+    # Pad a partial logic tree up to numIndices levels by wrapping it in
+    # catch-all [-1, ...] rules, then score indexRange with the padded tree.
+    missingLevels = self.numIndices - self.getLogicDepth(logic)
+    paddedLogic = deepcopy(logic)
+    for _ in range(0, missingLevels):
+      paddedLogic = [[-1, paddedLogic]]
+    return self.scoreRangeForFullLogic(missingLevels, indexRange, paddedLogic)
+
+  ##############################################################################
+  # Score Range For Full Logic
+  ##############################################################################
+  def scoreRangeForFullLogic(self, depth, indexRange, logic):
+    # Sum, over every problem in indexRange, the time (us) taken by the
+    # solution that logic selects for it; depth is accepted but not used here.
+    totalUs = 0
+    for indices in self.problemIndicesForRange(indexRange):
+      serial = self.indicesToSerial(0, indices)
+      flops = self.totalFlopsForProblemIndices(indices)
+      chosen = self.getSolutionForProblemIndicesUsingLogic(indices, logic)
+      # flops / gflops / 1000, evaluated left to right
+      totalUs += flops / self.data[serial + chosen] / 1000
+    return totalUs
+
+  ##############################################################################
+  # Get Solution For Problem Indices Using Logic
+  ##############################################################################
+  def getSolutionForProblemIndicesUsingLogic(self, problemIndices, logic): # walk logic one level per index
+    currentProblemIndices = self.toIndexOrder(problemIndices) # presumably reordered to match self.indexOrder - confirm in toIndexOrder
+    currentLogic = logic # replaced by the matched rule's nested value at each level
+    for i in range(0, self.numIndices):
+      currentSizeIndex = currentProblemIndices[0]
+      for j in range(0, len(currentLogic)):
+        if currentLogic[j][0] < 0: # negative threshold matches any size
+          currentProblemIndices = currentProblemIndices[1:]
+          currentLogic = currentLogic[j][1]
+          break
+        if currentLogic[j][0] >= 0:
+          if currentSizeIndex <= currentLogic[j][0]: # first rule whose threshold covers this size
+            currentProblemIndices = currentProblemIndices[1:]
+            currentLogic = currentLogic[j][1]
             break
-          else:
-            # different winner at this dilation size
-            # don't need to do anything
-            pass
-
-        # if we never revert to rule during dilation, create new rule
-        if not revert:
-          # solutionIdx, minSizeThresholdIdx, gflops at minSize, maxGFlops, old
-          newRule = [ winnerForSizeIdx, deepcopy(problemIndices), \
-              winnerForSizeGFlops, winnerForSizeGFlops, ruleWinnerGFlopsForSize]
-          diagonalRules.append(newRule)
-          numRules += 1
-          #print "Added new rule: %s" % newRule
-
-    return diagonalRules
-    #end diagonal rules
-
-
-  ##############################################################################
-  # Skinny Solutions
-  ##############################################################################
-  def getSkinnySolutions(self, diagonalRules, problemIndices, \
-      idxLarge, idxSmall):
-    idx0 = self.idx0
-    idx1 = self.idx1
-    #idxU = self.idxU
-    #dilation = self.analysisParameters["Dilation"]
-    threshold = self.analysisParameters["Threshold"]
-
-    skinnyRules = []
-
-    # for each size threshold along diagonal
-    for diagonalRuleIdx in range(0, len(diagonalRules)):
-      diagonalRule = diagonalRules[diagonalRuleIdx]
-      diagonalRuleWinnerIdx = diagonalRule[0]
-      diagonalRuleThresholdProblem = diagonalRule[1]
-      #diagonalRuleGFlops = diagonalRule[2] # perf at threshold
-      thresholdSizeFree = self.getSizeFree(diagonalRuleThresholdProblem)
-      print2("ThresholdSizeFree[%u][%u]: %u" \
-          % (diagonalRuleThresholdProblem[idx0], \
-          diagonalRuleThresholdProblem[idx1], \
-          thresholdSizeFree))
-
-      # check skinny d0<<d1 (large d0, small d1)
-      skinnyProblemIndices = deepcopy(problemIndices)
-      for sizeIdxSmall in range( diagonalRuleThresholdProblem[idxSmall]-1, -1, -1):
-        skinnyProblemIndices[idxSmall] = sizeIdxSmall
-        for sizeIdxLarge in range( diagonalRuleThresholdProblem[idxLarge], \
-            self.numProblemSizes[idxLarge]):
-          skinnyProblemIndices[idxLarge] = sizeIdxLarge
-
-
-          skinnySizeFree = self.getSizeFree(skinnyProblemIndices)
-          if skinnySizeFree > thresholdSizeFree:
-            #print "SkinnySizeFree[%u][%u]: %u" % (sizeIdxSmall, sizeIdxLarge, \
-            #  skinnySizeFree)
-
-            # rule winner's performance at this skinnyness
-            skinnyProblemIdx = self.indicesToSerial(0, skinnyProblemIndices)
-            diagonalWinnerGFlopsForSkinny = self.data[skinnyProblemIdx \
-                + diagonalRuleWinnerIdx]
-
-            # which solution wins here?
-            (winnerIdx, winnerGFlops) = \
-                self.getWinnerForProblem(skinnyProblemIndices)
-            #print winnerIdx, winnerGFlops
-            if winnerIdx == diagonalRuleWinnerIdx \
-                or diagonalWinnerGFlopsForSkinny > (1-threshold)*winnerGFlops:
-              # diagonal rule also wins here
-              print2("if dS <%5u and dL >%5u diagnl S[%2u] %5.0f == S[%2u] %5.0f GFlops" \
-                  % (self.problemIndexToSize[idxSmall][sizeIdxSmall], \
-                  self.problemIndexToSize[idxLarge][sizeIdxLarge], \
-                  winnerIdx, winnerGFlops, diagonalRuleWinnerIdx, \
-                  diagonalWinnerGFlopsForSkinny ))
-              pass
-            else:
-              # we're so skinny that diagonal rule no longer applies
-              print2("if dS <%5u and dL >%5u skinny S[%2u] %5.0f >> S[%2u] %5.0f GFlops" \
-                  % (self.problemIndexToSize[idxSmall][sizeIdxSmall], \
-                  self.problemIndexToSize[idxLarge][sizeIdxLarge], \
-                  winnerIdx, winnerGFlops, diagonalRuleWinnerIdx, \
-                  diagonalWinnerGFlopsForSkinny ))
-              skinnyRule = [deepcopy(skinnyProblemIndices), winnerIdx, \
-                  winnerGFlops]
-              skinnyRules.append(skinnyRule)
-              # TODO need to use dilate parameter to make sure we've switched
-              # TODO data along this size may not agree with
-              #   data along different sizes (but perhaps it should
-              # TODO need extra loop here, to iterate idxSmall to
-              # smaller sizes to see if the solution changes further
-
-            # does the diagonalRuleWinner also win here?
-            break # only check the problem size closest to ruleSize
-
-    return skinnyRules
-    # end skinny solutions
-
-  ##############################################################################
-  # Get Size Free and Summation
+    return currentLogic
+
+
+  ##############################################################################
+  ##############################################################################
+  ###
+  ###  Helper / Low-Level Functions
+  ###
+  ##############################################################################
+  ##############################################################################
+
+
   ##############################################################################
+  # Get Winner For Problem
   def getWinnerForProblem(self, problemIndices):
-    problemIdx = self.indicesToSerial(0, problemIndices)
+    problemSerial = self.indicesToSerial(0, problemIndices)
     winnerIdx = -1
     winnerGFlops = -1
     for solutionIdx in range(0, self.numSolutions):
-      solutionSerialIdx = problemIdx + solutionIdx
+      solutionSerialIdx = problemSerial + solutionIdx
       solutionGFlops = self.data[solutionSerialIdx]
       if solutionGFlops > winnerGFlops:
-        #print "%f > %f" % (solutionGFlops, winnerGFlops)
         winnerIdx = solutionIdx
         winnerGFlops = solutionGFlops
     return (winnerIdx, winnerGFlops)
 
 
   ##############################################################################
-  # Get Size Free and Summation
+  # Winner For Range
+  def winnerForRange(self, indexRange):
+    """Return the solution index with the smallest summed score over indexRange (earliest wins ties)."""
+    scores = self.scoreRangeForSolutions(indexRange)
+    best = 0
+    for candidate in range(1, self.numSolutions):
+      if scores[candidate] < scores[best]: best = candidate
+    return best
+
+
   ##############################################################################
+  # Score (microseconds) Range For Solutions
+  def scoreRangeForSolutions(self, indexRange):
+    """Return one score per solution: totalFlops / gflops / 1000 summed over every problem in indexRange."""
+    scores = [0]*self.numSolutions
+    for problemIndices in self.problemIndicesForRange(indexRange):
+      serialBase = self.indicesToSerial(0, problemIndices)
+      flops = self.totalFlopsForProblemIndices(problemIndices)
+      for solutionIdx in range(0, self.numSolutions):
+        gflops = self.data[serialBase+solutionIdx]
+        scores[solutionIdx] += flops / gflops / 1000
+    return scores
+
+
+  ##############################################################################
+  # Score Logic Complexity
+  def scoreLogicComplexity(self, logic, logicComplexity):
+    """Add 1 to logicComplexity[level] for each rule in logic, then recurse into each rule's element [1]."""
+    depth = self.getLogicDepth(logic)
+    if depth == 0: return
+    depth = self.numIndices - depth
+    for rule in logic:
+      logicComplexity[depth] += 1
+      self.scoreLogicComplexity(rule[1], logicComplexity)
+
+
+  ##############################################################################
+  # Get Logic Depth
+  def getLogicDepth(self, logic):
+    """Count list nesting levels, descending through element [0][1] at each level."""
+    node, depth = logic, 0
+    while isinstance(node, list):
+      node = node[0][1]
+      depth += 1
+    return depth
+
+
+  ##############################################################################
+  # To Index Order
+  def toIndexOrder(self, problemIndices):
+    """Return a new list holding problemIndices[i] for each i in
+    self.indexOrder, in that order."""
+    reordered = [problemIndices[i] for i in self.indexOrder]
+    return reordered
+
+
+  ##############################################################################
+  # Total Flops For Problem Indices
+  def totalFlopsForProblemIndices(self, problemIndices):
+    """Return self.flopsPerMac multiplied by the size selected for each of the numIndices indices."""
+    flops = self.flopsPerMac
+    for dim in range(0, self.numIndices): flops *= self.problemIndexToSize[dim][problemIndices[dim]]
+    return flops
+
+
+  ##############################################################################
+  # Recommended Index Order
+  # TODO, this may depend on transposes
+  def recommendedIndexOrder(self):
+    """Return a list of all index numbers in the recommended order.
+
+    Indices other than idxU, idx0 and idx1 come first (ascending), followed
+    by idxU, idx0, idx1.
+    """
+    tail = [self.idxU, self.idx0, self.idx1]
+    head = [i for i in range(0, self.numIndices) if i not in tail]
+    return head + tail
+
+  ##############################################################################
+  # Problem Indices For Range
+  def problemIndicesForRange(self, indexRange):
+    """Enumerate every index combination inside indexRange.
+
+    indexRange[i] is read as [start, end) for index i; index 0 advances
+    fastest.  Returns a list of index lists, one per problem.
+    """
+    problemIndexList = []
+    problemIndices = [idx[0] for idx in indexRange]
+    lastIdx = self.numIndices-1
+    while True:
+      # shallow copy; assumes the entries are plain ints (they are incremented below)
+      problemIndexList.append(list(problemIndices))
+      # odometer step: bump index 0 and carry into higher indices on overflow
+      problemIndices[0] += 1
+      for i in range(0, self.numIndices):
+        if problemIndices[i] < indexRange[i][1]:
+          break
+        if i == lastIdx:
+          return problemIndexList
+        problemIndices[i] = indexRange[i][0]
+        problemIndices[i+1] += 1
+
+
+  ##############################################################################
+  # Get Size Free
   def getSizeFree(self, problemIndices):
     sizeFree = 1
-    for i in range(0, self.problemType["NumIndicesC"]):
+    for i in self.rangeIndicesFree:
       sizeFree *= self.problemIndexToSize[i][problemIndices[i]]
     return sizeFree
 
+
+  ##############################################################################
+  # Get Size Summation
   def getSizeSummation(self, problemIndices):
     sizeSummation = 1
-    for i in range(self.problemType["NumIndicesC"], \
-        self.problemType["TotalIndices"]):
+    for i in self.rangeIndicesSummation:
       sizeSummation *= self.problemIndexToSize[i][problemIndices[i]]
     return sizeSummation
 
-  ##############################################################################
-  # Print Data
-  ##############################################################################
-  def printData(self):
-    print2("serial; idxD0, idxD1, idxDU, idxOthers; sizeD0, sizeD1, sizeDU, sizeOthers; sol0, sol1, sol2, ...")
-    indices = [0]*self.numIndices
-    for serial in range(0, self.totalProblems):
-      s = "[%4u] [%2u" % (serial, indices[0])
-      for i in range(1, self.numIndices):
-        s += ", %2u" % indices[i]
-      s += "] [%4u" % self.problemIndexToSize[0][indices[0]]
-      for i in range(1, self.numIndices):
-        s += ", %4u" % self.problemIndexToSize[i][indices[i]]
-      s += "]: %9.3f" % self.data[serial*self.numSolutions+0]
-      for i in range(1, self.numSolutions):
-        s += ", %9.3f" % self.data[serial*self.numSolutions+i]
-      print2(s)
-      indices[0] += 1
-      for i in range(1, self.numIndices):
-        if indices[i-1] >= self.numProblemSizes[i-1]:
-          indices[i-1] = 0
-          indices[i] += 1
 
   ##############################################################################
   # Get Item
-  ##############################################################################
   def __getitem__(self, indexTuple):
     indices = indexTuple[0] # in analysis order
     solutionIdx = indexTuple[1]
     serial = self.indicesToSerial(solutionIdx, indices)
     return self.data[serial]
 
+
   ##############################################################################
-  # Get Item
-  ##############################################################################
+  # Set Item
   def __setitem__(self, indexTuple, value):
     indices = indexTuple[0] # in analysis order
     solutionIdx = indexTuple[1]
     serial = self.indicesToSerial(solutionIdx, indices )
     self.data[serial] = value
 
+
   ##############################################################################
   # Indices -> Serial
-  ##############################################################################
   def indicesToSerial(self, solutionIdx, indices ):
     serial = 0
     stride = 1
@@ -588,8 +929,13 @@ def indicesToSerial(self, solutionIdx, indices ):
     return serial
 
 
+
+################################################################################
+################################################################################
+###
+###   Main
+###
 ################################################################################
-# Main
 ################################################################################
 def main(  config ):
   print2("# LibraryLogic config: %s" % config)
@@ -643,8 +989,17 @@ def main(  config ):
   # Run Analysis
   schedulePrefix = globalParameters["Name"]
   for problemTypeTuple in problemTypeTuples:
-    logic = analyzeProblemType( problemTypeTuple, analysisParameters )
+    logicTuple = analyzeProblemType( problemTypeTuple, analysisParameters )
     YAMLIO.writeLibraryLogicForProblemType(globalParameters["WorkingPath"], \
-        schedulePrefix, logic)
+        schedulePrefix, logicTuple)
 
   popWorkingPath()
+
+########################################
+# TODO
+# - different weights for different levels?
+#   are there pairs of weights that would result in same logic complexity but better score?
+
+########################################
+# TODO problems with this algorithm
+# - barrier to switching may not always be amortised on the next step; need to calculate several steps into the future to see if there is a net win; process needs to be a search tree. 32x32 search only takes 1 second
diff --git a/Tensile/SolutionStructs.py b/Tensile/SolutionStructs.py
index b21f98728..e3d66be05 100644
--- a/Tensile/SolutionStructs.py
+++ b/Tensile/SolutionStructs.py
@@ -20,7 +20,7 @@
 ################################################################################
 
 
-from Common import globalParameters, defaultProblemType, assignParameterWithDefault, printExit, assignParameterRequired, defaultSolution, derrivedParameters
+from Common import globalParameters, defaultProblemType, assignParameterWithDefault, printExit, assignParameterRequired, defaultSolution, derivedParameters
 from copy import deepcopy
 
 ################################################################################
@@ -95,9 +95,9 @@ def zeroString(self, backend):
     zeroString = "(%s)(" % self.toDevice(backend)
     if self.value == self.single or self.value == self.half:
       zeroString += "0.f"
-    elif self.value == self.double: 
+    elif self.value == self.double:
       zeroString += "0.0"
-    elif self.value == self.complexSingle: 
+    elif self.value == self.complexSingle:
       zeroString += "0.f, 0.f"
     elif self.value == self.complexDouble:
       zeroString += "0.0, 0.0"
@@ -123,6 +123,8 @@ def numRegisters( self ):
     return self.properties[self.value][self.idxReg]
   def numBytes( self ):
     return self.numRegisters() * 4
+  def flopsPerMac(self):
+    return 2 if self.isReal() else 8 # 2 when isReal() is true, otherwise 8
 
   def __str__(self):
     return self.toChar()
@@ -214,7 +216,8 @@ def __init__(self, config):
     elif self["OperationType"] == "TensorContraction":
       self.initTensorContraction(config)
 
-    self.assignIndices()
+    self.state["AssignedDerivedParameters"] = False
+    ProblemType.assignDerivedParameters(self.state)
 
 
   ########################################
@@ -249,90 +252,98 @@ def isTensorContraction(self):
 
   ########################################
   # determine d0, d1, dU
-  def assignIndices(self):
-    self["TotalIndices"] = max(max(self["IndexAssignmentsA"])+1, max(self["IndexAssignmentsB"])+1)
+  @staticmethod
+  def assignDerivedParameters(state):
+    if "AssignedDerivedParameters" in state:
+      if state["AssignedDerivedParameters"]:
+        return
+    state["AssignedDerivedParameters"] = False
+
+    state["TotalIndices"] = max(max(state["IndexAssignmentsA"])+1, \
+        max(state["IndexAssignmentsB"])+1)
 
     # determine num free, batch
-    self["IndicesFree"] = []
-    self["IndicesBatch"] = []
-    self["IndicesSummation"] = []
+    state["IndicesFree"] = []
+    state["IndicesBatch"] = []
+    state["IndicesSummation"] = []
 
-    for i in range(0, self["NumIndicesC"]):
-      inA = i in self["IndexAssignmentsA"]
-      inB = i in self["IndexAssignmentsB"]
+    for i in range(0, state["NumIndicesC"]):
+      inA = i in state["IndexAssignmentsA"]
+      inB = i in state["IndexAssignmentsB"]
       if inA and inB:
-        #self["NumIndicesBatch"] = (i+1)-self["NumIndicesFree"]
-        self["IndicesBatch"].append(i)
+        #state["NumIndicesBatch"] = (i+1)-state["NumIndicesFree"]
+        state["IndicesBatch"].append(i)
 
       elif inA or inB:
-        #self["NumIndicesFree"] = (i+1)
-        self["IndicesFree"].append(i)
+        #state["NumIndicesFree"] = (i+1)
+        state["IndicesFree"].append(i)
       else:
         printExit("invalid index %u" % i)
 
     # determine num summation
-    for i in range(self["NumIndicesC"], self["TotalIndices"]):
-      inA = i in self["IndexAssignmentsA"]
-      inB = i in self["IndexAssignmentsB"]
+    for i in range(state["NumIndicesC"], state["TotalIndices"]):
+      inA = i in state["IndexAssignmentsA"]
+      inB = i in state["IndexAssignmentsB"]
       if inA and inB:
-        #self["NumIndicesSummation"] = (i+1)-self["NumIndicesC"]
-        self.state["IndicesSummation"].append(i)
+        #state["NumIndicesSummation"] = (i+1)-state["NumIndicesC"]
+        state["IndicesSummation"].append(i)
       else:
         printExit("invalid index %u" % i)
-    self["NumIndicesFree"] = len(self["IndicesFree"])
-    self["NumIndicesBatch"] = len(self["IndicesBatch"])
-    self["NumIndicesSummation"] = len(self["IndicesSummation"])
+    state["NumIndicesFree"] = len(state["IndicesFree"])
+    state["NumIndicesBatch"] = len(state["IndicesBatch"])
+    state["NumIndicesSummation"] = len(state["IndicesSummation"])
 
 
     # by default, unroll index will be the first summation index
     # TODO sort summation indices by "stride"
-    self["IndexUnroll"] = self["IndicesSummation"][0]
-    for i in range(0, len(self["IndexAssignmentsA"])):
-      if self["IndexAssignmentsA"][i] == self["IndexUnroll"]:
-        self["IndexUnrollA"] = i
+    state["IndexUnroll"] = state["IndicesSummation"][0]
+    for i in range(0, len(state["IndexAssignmentsA"])):
+      if state["IndexAssignmentsA"][i] == state["IndexUnroll"]:
+        state["IndexUnrollA"] = i
         break
-    for i in range(0, len(self["IndexAssignmentsB"])):
-      if self["IndexAssignmentsB"][i] == self["IndexUnroll"]:
-        self["IndexUnrollB"] = i
+    for i in range(0, len(state["IndexAssignmentsB"])):
+      if state["IndexAssignmentsB"][i] == state["IndexUnroll"]:
+        state["IndexUnrollB"] = i
         break
 
     # assign d0, d1
-    self["Index01A"] = -1
-    self["Index01B"] = -1
-    for i in self["IndexAssignmentsA"]:
-      if i < self["NumIndicesC"]:
-        self["Index01A"] = i
+    state["Index01A"] = -1
+    state["Index01B"] = -1
+    for i in state["IndexAssignmentsA"]:
+      if i < state["NumIndicesC"]:
+        state["Index01A"] = i
         break
-    for i in self["IndexAssignmentsB"]:
-      if i < self["NumIndicesC"]:
-        self["Index01B"] = i
+    for i in state["IndexAssignmentsB"]:
+      if i < state["NumIndicesC"]:
+        state["Index01B"] = i
         break
     # whichever has lower stride in C (lower value), is 0, other is 1
-    if self["Index01A"] < self["Index01B"]:
-      self["Index0"]  = self["Index01A"]
-      self["Index1"]  = self["Index01B"]
-      self["Tensor0"] = 0
-      self["Tensor1"] = 1
-      self["TileA"] = 0
-      self["TileB"] = 1
+    if state["Index01A"] < state["Index01B"]:
+      state["Index0"]  = state["Index01A"]
+      state["Index1"]  = state["Index01B"]
+      state["Tensor0"] = 0
+      state["Tensor1"] = 1
+      state["TileA"] = 0
+      state["TileB"] = 1
     else:
-      self["Index0"]  = self["Index01B"]
-      self["Index1"]  = self["Index01A"]
-      self["Tensor0"] = 1
-      self["Tensor1"] = 0
-      self["TileA"] = 1
-      self["TileB"] = 0
+      state["Index0"]  = state["Index01B"]
+      state["Index1"]  = state["Index01A"]
+      state["Tensor0"] = 1
+      state["Tensor1"] = 0
+      state["TileA"] = 1
+      state["TileB"] = 0
 
     # generalize transpose
-    strideIdxA = self["IndexAssignmentsA"].index(self["Index01A"])
-    strideIdxB = self["IndexAssignmentsB"].index(self["Index01B"])
-    unrollIdxA = self["IndexAssignmentsA"].index(self["IndexUnroll"])
-    unrollIdxB = self["IndexAssignmentsB"].index(self["IndexUnroll"])
-    self["TLUA"] = strideIdxA < unrollIdxA
-    self["TLUB"] = strideIdxB < unrollIdxB
+    strideIdxA = state["IndexAssignmentsA"].index(state["Index01A"])
+    strideIdxB = state["IndexAssignmentsB"].index(state["Index01B"])
+    unrollIdxA = state["IndexAssignmentsA"].index(state["IndexUnroll"])
+    unrollIdxB = state["IndexAssignmentsB"].index(state["IndexUnroll"])
+    state["TLUA"] = strideIdxA < unrollIdxA
+    state["TLUB"] = strideIdxB < unrollIdxB
 
     #unrollDimStrideGreaterThanTileDimStrideA = TLUA
     #unrollDimStrideLessThanTileDimStrideB    = !TLUB
+    state["AssignedDerivedParameters"] = True
 
 
 
@@ -364,6 +375,18 @@ def __str__(self):
     if self["UseInitialStrides"]: name += "I"
     return name
 
+  # dict-style helpers: each of the three methods below delegates to self.state
+  def keys(self):
+    return self.state.keys()
+
+  def __len__(self):
+    return len(self.state)
+
+  def __iter__(self):
+    return iter(self.state)
+
+
+
   def __getitem__(self, key):
     return self.state[key]
   def __setitem__(self, key, value):
@@ -504,8 +527,10 @@ def __init__(self, config):
     for key in config:
       if key != "ProblemType" and key not in self.state:
         self.state[key] = config[key]
-
-    Solution.assignDimsFromEdgeAndShape(self.state)
+    self["Valid"] = True
+    self["AssignedProblemIndependentDerivedParameters"] = False
+    self["AssignedDerivedParameters"] = False
+    Solution.assignDerivedParameters(self.state)
 
   ########################################
   # get a list of kernel parameters for this solution
@@ -529,24 +554,28 @@ def getKernels(self):
 
 
   ########################################
-  # assign Dim0, 1 based on edge and shape
+  # assign tile sizes
   @staticmethod
-  def assignDimsFromEdgeAndShape(state):
+  def assignProblemIndependentDerivedParameters(state):
+    if "AssignedProblemIndependentDerivedParameters" in state:
+      if state["AssignedProblemIndependentDerivedParameters"]:
+        return
+    state["AssignedProblemIndependentDerivedParameters"] = False
     # workgroup sizes
     state["WorkGroup0"] = state["WorkGroupEdge"]
     state["WorkGroup1"] = state["WorkGroupEdge"]
-    if state["WorkGroupShape"] == 1:
-      state["WorkGroup1"] *= 2
-    elif state["WorkGroupShape"] == -1:
-      state["WorkGroup0"] *= 2
+    if state["WorkGroupShape"] > 0:
+      state["WorkGroup1"] *= abs(state["WorkGroupShape"])
+    elif state["WorkGroupShape"] < 0:
+      state["WorkGroup0"] *= abs(state["WorkGroupShape"])
 
     # thread tile sizes
     state["ThreadTile0"] = state["ThreadTileEdge"]
     state["ThreadTile1"] = state["ThreadTileEdge"]
-    if state["ThreadTileShape"] == 1:
-      state["ThreadTile1"] *= 2
-    elif state["ThreadTileShape"] == -1:
-      state["ThreadTile0"] *= 2
+    if state["ThreadTileShape"] > 0:
+      state["ThreadTile1"] *= abs(state["ThreadTileShape"])
+    elif state["ThreadTileShape"] < 0:
+      state["ThreadTile0"] *= abs(state["ThreadTileShape"])
 
     # macro tile sizes
     if "WorkGroup0" in state and "ThreadTile0" in state:
@@ -556,6 +585,267 @@ def assignDimsFromEdgeAndShape(state):
     if "SplitU" in state and "LoopUnroll" in state:
       state["DepthU"] = state["SplitU"] * state["LoopUnroll"]
 
+    printReason = False
+    # num threads
+    state["NumThreads"] = state["WorkGroup0"]*state["WorkGroup1"]
+    if state["NumThreads"] > globalParameters["MaxThreads"]:
+      if printReason: print2("rejecting %u threads" % state["NumThreads"])
+      state["Valid"] = False
+    if state["NumThreads"] < globalParameters["MinThreads"]:
+      if printReason: print2("rejecting %u threads" % state["NumThreads"])
+      state["Valid"] = False
+
+    # tile shape
+    if state["MacroTile0"]/state["MacroTile1"] > globalParameters["MaxMacroTileRatio"] \
+        or state["MacroTile1"]/state["MacroTile0"] > globalParameters["MaxMacroTileRatio"]:
+      state["Valid"] = False
+
+    # done
+    state["AssignedProblemIndependentDerivedParameters"] = True
+
+
+  ########################################
+  # assign all derived parameters
+  @staticmethod
+  def assignDerivedParameters(state):
+    """Derive the load-split parameters for A and B and validate the solution.
+
+    Runs the problem-independent derivation and the ProblemType derivation,
+    then computes NumLoadsA/B and splits each into NumLoadsCoalesced x
+    NumLoadsPerpendicular.  A requested NumLoadsCoalesced of 1 tries
+    candidates upward from 1, -1 tries them downward from NumLoads, and the
+    first one that divides evenly is kept; any other value is only checked.
+
+    state is modified in place and nothing is returned.  state["Valid"] is
+    set to False when a limit or divisibility check fails; most failures
+    return at once, leaving state["AssignedDerivedParameters"] False.
+    """
+    Solution.assignProblemIndependentDerivedParameters(state)
+    if "AssignedDerivedParameters" in state:
+      if state["AssignedDerivedParameters"]:
+        return
+    state["AssignedDerivedParameters"] = False
+
+    ProblemType.assignDerivedParameters(state["ProblemType"])
+    # NOTE(review): print2 is not in this file's visible "from Common import"
+    # list, so switching printReason on may raise NameError -- confirm.
+    printReason = False
+
+    # tile size
+    if state["ThreadTile0"]*state["ThreadTile1"]*state["ProblemType"]["DataType"].numRegisters() > globalParameters["MaxThreadTile"]:
+      state["Valid"] = False
+
+    # how many elements to load
+    if state["ProblemType"]["TLUA"]:
+      totalElementsCoalescedA = state["MacroTile0"]
+      totalElementsPerpA = state["LoopUnroll"]
+    else:
+      totalElementsCoalescedA = state["LoopUnroll"]
+      totalElementsPerpA = state["MacroTile0"]
+
+    if state["ProblemType"]["TLUB"]:
+      totalElementsCoalescedB = state["MacroTile1"]
+      totalElementsPerpB = state["LoopUnroll"]
+    else:
+      totalElementsCoalescedB = state["LoopUnroll"]
+      totalElementsPerpB = state["MacroTile1"]
+    totalElementsA = totalElementsCoalescedA * totalElementsPerpA
+    totalElementsB = totalElementsCoalescedB * totalElementsPerpB
+
+    # how many load instructions
+    if totalElementsA % state["NumThreads"] != 0:
+      if printReason: print2("totalElementsA %u %% NumThreads %u != 0" \
+          % (totalElementsA, state["NumThreads"]))
+      state["Valid"] = False
+      return
+    else:
+      state["NumLoadsA"] = totalElementsA // state["NumThreads"]
+    if totalElementsB % state["NumThreads"] != 0:
+      if printReason: print2("totalElementsB %u %% NumThreads %u != 0" \
+          % (totalElementsB, state["NumThreads"]))
+      state["Valid"] = False
+      return
+    else:
+      state["NumLoadsB"] = totalElementsB // state["NumThreads"]
+
+    # nlca = 1
+    if state["NumLoadsCoalescedA"] == 1:
+      foundValid = False
+      for nlca in range(1, state["NumLoadsA"]+1):
+        nlpa = state["NumLoadsA"] // nlca
+        if state["NumLoadsA"] % nlca == 0 \
+            and totalElementsCoalescedA % nlca == 0 \
+            and totalElementsPerpA % nlpa == 0:
+          state["NumLoadsCoalescedA"] = nlca
+          state["NumLoadsPerpendicularA"] = nlpa
+          foundValid = True
+          break
+      if not foundValid:
+        state["Valid"] = False
+        return
+
+    # nlca = -1
+    elif state["NumLoadsCoalescedA"] == -1:
+      foundValid = False
+      for nlca in range(state["NumLoadsA"], 0, -1):
+        nlpa = state["NumLoadsA"] // nlca
+        if state["NumLoadsA"] % nlca == 0 \
+            and totalElementsCoalescedA % nlca == 0 \
+            and totalElementsPerpA % nlpa == 0:
+          state["NumLoadsCoalescedA"] = nlca
+          state["NumLoadsPerpendicularA"] = nlpa
+          foundValid = True
+          break
+      if not foundValid:
+        state["Valid"] = False
+        return
+
+    # nlca = other
+    else:
+      state["NumLoadsPerpendicularA"] = state["NumLoadsA"] \
+          // state["NumLoadsCoalescedA"]
+
+      if state["NumLoadsA"] % state["NumLoadsCoalescedA"] != 0:
+        if printReason: print2("numLoadsA %u %% numLoadsParaA %u != 0" \
+            % (state["NumLoadsA"], state["NumLoadsCoalescedA"]))
+        state["Valid"] = False
+        # stop: NumLoadsPerpendicularA may be 0 here, which the modulo below cannot take
+        return
+      if totalElementsCoalescedA % state["NumLoadsCoalescedA"] != 0:
+        if printReason: print2("totalElementsCoalescedA %u %% numLoadsParaA %u != 0" \
+            % (totalElementsCoalescedA, state["NumLoadsCoalescedA"]))
+        state["Valid"] = False
+        return
+      if totalElementsPerpA % state["NumLoadsPerpendicularA"] != 0:
+        if printReason: print2("totalElementsPerpA %u %% numLoadsPerpA %u != 0" \
+            % (totalElementsPerpA, state["NumLoadsPerpendicularA"]))
+        state["Valid"] = False
+        return
+
+
+    # nlcb = 1
+    if state["NumLoadsCoalescedB"] == 1:
+      foundValid = False
+      for nlcb in range(1, state["NumLoadsB"]+1):
+        nlpb = state["NumLoadsB"] // nlcb
+        if state["NumLoadsB"] % nlcb == 0 \
+            and totalElementsCoalescedB % nlcb == 0 \
+            and totalElementsPerpB % nlpb == 0:
+          state["NumLoadsCoalescedB"] = nlcb
+          state["NumLoadsPerpendicularB"] = nlpb
+          foundValid = True
+          break
+      if not foundValid:
+        state["Valid"] = False
+        return
+
+    # nlcb = -1
+    elif state["NumLoadsCoalescedB"] == -1:
+      foundValid = False
+      for nlcb in range(state["NumLoadsB"], 0, -1):
+        nlpb = state["NumLoadsB"] // nlcb
+        if state["NumLoadsB"] % nlcb == 0 \
+            and totalElementsCoalescedB % nlcb == 0 \
+            and totalElementsPerpB % nlpb == 0:
+          state["NumLoadsCoalescedB"] = nlcb
+          state["NumLoadsPerpendicularB"] = nlpb
+          foundValid = True
+          break
+      if not foundValid:
+        state["Valid"] = False
+        return
+
+    # nlcb = other
+    else:
+      state["NumLoadsPerpendicularB"] = state["NumLoadsB"] \
+          // state["NumLoadsCoalescedB"]
+
+      if state["NumLoadsB"] % state["NumLoadsCoalescedB"] != 0:
+        if printReason: print2("numLoadsB %u %% numLoadsParaB %u != 0" \
+            % (state["NumLoadsB"], state["NumLoadsCoalescedB"]))
+        state["Valid"] = False
+        # stop: NumLoadsPerpendicularB may be 0 here, which the modulo below cannot take
+        return
+      if totalElementsCoalescedB % state["NumLoadsCoalescedB"] != 0:
+        if printReason: print2("totalElementsCoalescedB %u %% numLoadsParaB %u != 0" \
+            % (totalElementsCoalescedB, state["NumLoadsCoalescedB"]))
+        state["Valid"] = False
+        return
+      if totalElementsPerpB % state["NumLoadsPerpendicularB"] != 0:
+        if printReason: print2("totalElementsPerpB %u %% numLoadsPerpB %u != 0" \
+            % (totalElementsPerpB, state["NumLoadsPerpendicularB"]))
+        state["Valid"] = False
+        return
+
+    # NOTE(review): the string literal below is never executed; it looks like an
+    # earlier draft of the B-side checks above -- candidate for deletion.
+    """
+    if state["NumLoadsCoalescedB"] < 1:
+      state["NumLoadsCoalescedB"] = state["NumLoadsB"]
+    if state["NumLoadsB"] % state["NumLoadsCoalescedB"] != 0:
+      if printReason: print2("numLoadsB %u %% numLoadsParaB %u != 0" \
+          % (state["NumLoadsB"], state["NumLoadsCoalescedB"]))
+      state["Valid"] = False
+      return
+    else:
+      state["NumLoadsPerpendicularB"] = state["NumLoadsB"] \
+          / state["NumLoadsCoalescedB"]
+
+
+    # load size para/perp B
+    if totalElementsCoalescedB % state["NumLoadsCoalescedB"] != 0:
+      if printReason: print2("totalElementsCoalescedB %u %% numLoadsParaB %u != 0" \
+          % (totalElementsCoalescedB, state["NumLoadsCoalescedB"]))
+      state["Valid"] = False
+      return
+    #else:
+    #  loadSizeParaB = totalElementsCoalescedB / state["NumLoadsCoalescedB"]
+    if totalElementsPerpB % state["NumLoadsPerpendicularB"] != 0:
+      if printReason: print2("totalElementsPerpB %u %% numLoadsPerpB %u != 0" \
+          % (totalElementsPerpB, state["NumLoadsPerpendicularB"]))
+      state["Valid"] = False
+      return
+    #else:
+    #  loadSizePerpB = totalElementsPerpB / state["NumLoadsPerpendicularB"]
+    """
+
+    # too much LDS
+    sizeLDS = state["LoopUnroll"] \
+        * (state["PadLDS"] * 2 + state["MacroTile0"] \
+        + state["MacroTile1"] ) \
+        * state["ProblemType"]["DataType"].numBytes()
+    if sizeLDS > globalParameters["MaxLDS"]:
+      if printReason: print2("Kernel Uses %u > %u bytes" % ( sizeLDS, globalParameters["MaxLDS"]))
+      state["Valid"] = False
+      return
+
+    # Compiler may be causing incorrect spills on ROCm1.4 from DT on 2/21/17
+    if globalParameters["Backend"] == "HIP":
+      if state["ProblemType"]["DataType"].value == DataType.single:
+        if state["MacroTile0"] == 128 or state["MacroTile1"] == 128:
+          if state["NumLoadsCoalescedA"] != 1 and state["NumLoadsCoalescedB"] != 8:
+            state["Valid"] = False
+            #return
+      elif state["ProblemType"]["DataType"].value == DataType.double:
+        if state["MacroTile0"] >= 64 or state["MacroTile1"] >= 64:
+          state["Valid"] = False
+          #return
+    state["AssignedDerivedParameters"] = True
+
+
+# validation failures
+# Cijk_Ailk_Bjlk_SB_DU16_LU16_MT064_MT164_NLA16_NLB16_NLCA02_NLCB01_NLPA08_NLPB16_TT008_TT108_TTE08_WG008_WG108_WGE08
+# Cijk_Ailk_Bjlk_SB_DU16_LU16_MT064_MT164_NLA16_NLB16_NLCA04_NLCB02_NLPA04_NLPB08_TT008_TT108_TTE08_WG008_WG108_WGE08
+# Cijk_Ailk_Bjlk_SB_DU16_LU16_MT064_MT164_NLA16_NLB16_NLCA02_NLCB04_NLPA08_NLPB04_TT008_TT108_TTE08_WG008_WG108_WGE08
+
+# Cijk_Ailk_Bjlk_DB_DU16_LU16_MT064_MT164_NLA16_NLB16_NLCA04_NLCB01_NLPA04_NLPB16_TT008_TT108_TTE08_WG008_WG108_WGE08
+# Cijk_Ailk_Bjlk_DB_DU08_LU08_MT064_MT164_NLA08_NLB08_NLCA01_NLCB01_NLPA08_NLPB08_TT008_TT108_TTE08_WG008_WG108_WGE08
+# Cijk_Ailk_Bjlk_DB_DU08_LU08_MT064_MT164_NLA08_NLB08_NLCA08_NLCB01_NLPA01_NLPB08_TT008_TT108_TTE08_WG008_WG108_WGE08
+# Cijk_Ailk_Bjlk_DB_DU08_LU08_MT064_MT164_NLA08_NLB08_NLCA08_NLCB08_NLPA01_NLPB01_TT008_TT108_TTE08_WG008_WG108_WGE08
+# Cijk_Ailk_Bjlk_DB_DU16_LU16_MT064_MT164_NLA16_NLB16_NLCA08_NLCB08_NLPA02_NLPB02_TT008_TT108_TTE08_WG008_WG108_WGE08
+# Cijk_Ailk_Bjlk_DB_DU08_LU08_MT064_MT164_NLA08_NLB08_NLCA01_NLCB08_NLPA08_NLPB01_TT008_TT108_TTE08_WG008_WG108_WGE08
+
+
 
   ########################################
   # create a dictionary with booleans on whether to include parameter in name
@@ -573,14 +863,16 @@ def getMinNaming(objs):
     # only 1, rather than name being nothing, it'll be everything
     if len(objs) == 1:
       for key in keys:
-        requiredParameters[key] = False
+        if key not in derivedParameters:
+          requiredParameters[key] = False
     else:
       for key in keys:
         required = False
-        for i in range(1, len(objs)):
-          if objs[0][key] != objs[i][key]:
-            required = True
-            break
+        if key not in derivedParameters:
+          for i in range(1, len(objs)):
+            if objs[0][key] != objs[i][key]:
+              required = True
+              break
         if required:
           requiredParameters[key] = True
         else:
@@ -617,8 +909,6 @@ def getNameMin(state, requiredParameters):
             first = False
           name += "%s%s" % ( Solution.getParameterNameAbbreviation(key), \
               Solution.getParameterValueAbbreviation(state[key]) )
-      #else:
-      #  print "%s not in %s" % (key, requiredParameters)
     return name
 
   ########################################
@@ -627,26 +917,20 @@ def getNameMin(state, requiredParameters):
   def getSerialNaming(objs):
     data = {}
     for objIdx in range(0, len(objs)):
-      #print "ObjIdx: %u" % objIdx
       obj = objs[objIdx]
       for paramName in sorted(obj.keys()):
-        if paramName not in derrivedParameters:
+        if paramName not in derivedParameters:
           paramValue = obj[paramName]
-          #if paramName == "ThreadTileEdge":
-          #  print "%s = %s" % (paramName, paramValue)
           if paramName in data:
             if paramValue not in data[paramName]:
               data[paramName].append(paramValue)
           else:
             data[paramName] = [ paramValue ]
     maxObjs = 1
-    #print "SerialNaming:"
     for paramName in data:
       data[paramName] = sorted(data[paramName])
-      #print "%s: %s" % (paramName, data[paramName])
       maxObjs *= len(data[paramName])
     numDigits = len(str(maxObjs))
-    #print "MaxSerialNames: %u (%u)" % (maxObjs, numDigits)
     return [ data, numDigits ]
 
   ########################################
@@ -659,29 +943,16 @@ def getNameSerial(state, serialNaming):
     serial = 0
     multiplier = 1
     for paramName in sorted(state.keys()):
-      if paramName not in derrivedParameters:
+      if paramName not in derivedParameters:
         paramValue = state[paramName]
         paramData = data[paramName]
         paramNameMultiplier = len(paramData)
         if paramValue in paramData:
           paramValueIdx = paramData.index(paramValue)
-        #else:
-          #print "ERROR %s: %s not in %s" % ( paramName, paramValue, paramData )
-          #print state
-          #printExit()
-        #if paramNameMultiplier > 1:
-          #print "serial = %u*%u + %u; multiplier = %u * %u; %s::%s in %s" % ( \
-          #    paramValueIdx, multiplier, serial, \
-          #    paramNameMultiplier, multiplier, \
-          #    paramName, paramValue, paramData[1] )
-
         serial += paramValueIdx * multiplier
         multiplier *= paramNameMultiplier
-    #if serial == 0:
-    #  print state
     name = "%s%0*u" % ("S" if isinstance(state, Solution) else "K", \
         numDigits, serial)
-    #print "SerialName: %s" % name
     return name
 
 
@@ -746,7 +1017,8 @@ def __hash__(self):
     return hash(str(self))
     #return hash(self.getAttributes())
   def __eq__(self, other):
-    return isinstance(other, Solution) and self.getAttributes() == other.getAttributes()
+    # compare by str() so equality agrees with __hash__, which hashes str(self)
+    return isinstance(other, Solution) and str(self) == str(other)
   def __ne__(self, other):
     result = self.__eq__(other)
     if result is NotImplemented:
diff --git a/Tensile/SolutionWriter.py b/Tensile/SolutionWriter.py
index c09b72d70..ea7585b57 100644
--- a/Tensile/SolutionWriter.py
+++ b/Tensile/SolutionWriter.py
@@ -787,123 +787,3 @@ def getHeaderFileString(self, solution):
     return fileStr
 
 
-  ##############################################################################
-  # are solution parameters (dict) self-consistent
-  ##############################################################################
-  @ staticmethod
-  def solutionParametersConsistent(solution):
-    printReason = False
-
-    numThreads = solution["WorkGroup0"]*solution["WorkGroup1"]
-    if numThreads > globalParameters["MaxThreads"]:
-      if printReason: print2("rejecting %u threads" % numThreads)
-      return False
-
-    # how many elements to load
-    if solution["ProblemType"]["TLUA"]:
-      totalElementsParaA = solution["MacroTile0"]
-      totalElementsPerpA = solution["LoopUnroll"]
-    else:
-      totalElementsParaA = solution["LoopUnroll"]
-      totalElementsPerpA = solution["MacroTile0"]
-
-    if solution["ProblemType"]["TLUB"]:
-      totalElementsParaB = solution["MacroTile1"]
-      totalElementsPerpB = solution["LoopUnroll"]
-    else:
-      totalElementsParaB = solution["LoopUnroll"]
-      totalElementsPerpB = solution["MacroTile1"]
-    totalElementsA = totalElementsParaA * totalElementsPerpA
-    totalElementsB = totalElementsParaB * totalElementsPerpB
-
-    # how many load instructions
-    if totalElementsA % numThreads != 0:
-      if printReason: print2("totalElementsA %u %% numThreads %u != 0" \
-          % (totalElementsA, numThreads))
-      return False
-    else:
-      solution["NumLoadsA"] = totalElementsA / numThreads
-    if totalElementsB % numThreads != 0:
-      if printReason: print2("totalElementsB %u %% numThreads %u != 0" \
-          % (totalElementsB, numThreads))
-      return False
-    else:
-      solution["NumLoadsB"] = totalElementsB / numThreads
-
-      # how many loads para
-      if solution["NumLoadsA"] % solution["NumLoadsCoalescedA"] != 0:
-        if printReason: print2("numLoadsA %u %% numLoadsParaA %u != 0" \
-            % (solution["NumLoadsA"], solution["NumLoadsCoalescedA"]))
-        return False
-      else:
-        solution["NumLoadsPerpendicularA"] = solution["NumLoadsA"] \
-            / solution["NumLoadsCoalescedA"]
-      if solution["NumLoadsB"] % solution["NumLoadsCoalescedB"] != 0:
-        if printReason: print2("numLoadsB %u %% numLoadsParaB %u != 0" \
-            % (solution["NumLoadsB"], solution["NumLoadsCoalescedB"]))
-        return False
-      else:
-        solution["NumLoadsPerpendicularB"] = solution["NumLoadsB"] \
-            / solution["NumLoadsCoalescedB"]
-
-    # load size para/perp A
-    if totalElementsParaA % solution["NumLoadsCoalescedA"] != 0:
-      if printReason: print2("totalElementsParaA %u %% numLoadsParaA %u != 0" \
-          % (totalElementsParaA, solution["NumLoadsCoalescedA"]))
-      return False
-    #else:
-    #  loadSizeParaA = totalElementsParaA / solution["NumLoadsCoalescedA"]
-    if totalElementsPerpA % solution["NumLoadsPerpendicularA"] != 0:
-      if printReason: print2("totalElementsPerpA %u %% numLoadsPerpA %u != 0" \
-          % (totalElementsPerpA, solution["NumLoadsPerpendicularA"]))
-      return False
-    #else:
-    #  loadSizePerpA = totalElementsPerpA / solution["NumLoadsPerpendicularA"]
-
-    # load size para/perp B
-    if totalElementsParaB % solution["NumLoadsCoalescedB"] != 0:
-      if printReason: print2("totalElementsParaB %u %% numLoadsParaB %u != 0" \
-          % (totalElementsParaB, solution["NumLoadsCoalescedB"]))
-      return False
-    #else:
-    #  loadSizeParaB = totalElementsParaB / solution["NumLoadsCoalescedB"]
-    if totalElementsPerpB % solution["NumLoadsPerpendicularB"] != 0:
-      if printReason: print2("totalElementsPerpB %u %% numLoadsPerpB %u != 0" \
-          % (totalElementsPerpB, solution["NumLoadsPerpendicularB"]))
-      return False
-    #else:
-    #  loadSizePerpB = totalElementsPerpB / solution["NumLoadsPerpendicularB"]
-
-    # too much LDS
-    sizeLDS = solution["LoopUnroll"] \
-        * (solution["PadLDS"] * 2 + solution["MacroTile0"] \
-        + solution["MacroTile1"] ) \
-        * solution["ProblemType"]["DataType"].numBytes()
-    if sizeLDS > globalParameters["MaxLDS"]:
-      if printReason: print2("Kernel Uses %u > %u bytes" % ( sizeLDS, globalParameters["MaxLDS"]))
-      return False
-
-    # Compiler may be causing incorrect spills on ROCm1.4 from DT on 2/21/17
-    if globalParameters["Backend"] == "HIP":
-      if solution["ProblemType"]["DataType"].value == DataType.single:
-        if solution["MacroTile0"] == 128 or solution["MacroTile1"] == 128:
-          if solution["NumLoadsCoalescedA"] != 1 and solution["NumLoadsCoalescedB"] != 8:
-            return False
-      elif solution["ProblemType"]["DataType"].value == DataType.double:
-        if globalParameters["Backend"] == "HIP":
-          if solution["MacroTile0"] >= 64 or solution["MacroTile1"] >= 64:
-            return False
-# validation failures
-# Cijk_Ailk_Bjlk_SB_DU16_LU16_MT064_MT164_NLA16_NLB16_NLCA02_NLCB01_NLPA08_NLPB16_TT008_TT108_TTE08_WG008_WG108_WGE08
-# Cijk_Ailk_Bjlk_SB_DU16_LU16_MT064_MT164_NLA16_NLB16_NLCA04_NLCB02_NLPA04_NLPB08_TT008_TT108_TTE08_WG008_WG108_WGE08
-# Cijk_Ailk_Bjlk_SB_DU16_LU16_MT064_MT164_NLA16_NLB16_NLCA02_NLCB04_NLPA08_NLPB04_TT008_TT108_TTE08_WG008_WG108_WGE08
-
-# Cijk_Ailk_Bjlk_DB_DU16_LU16_MT064_MT164_NLA16_NLB16_NLCA04_NLCB01_NLPA04_NLPB16_TT008_TT108_TTE08_WG008_WG108_WGE08
-# Cijk_Ailk_Bjlk_DB_DU08_LU08_MT064_MT164_NLA08_NLB08_NLCA01_NLCB01_NLPA08_NLPB08_TT008_TT108_TTE08_WG008_WG108_WGE08
-# Cijk_Ailk_Bjlk_DB_DU08_LU08_MT064_MT164_NLA08_NLB08_NLCA08_NLCB01_NLPA01_NLPB08_TT008_TT108_TTE08_WG008_WG108_WGE08
-# Cijk_Ailk_Bjlk_DB_DU08_LU08_MT064_MT164_NLA08_NLB08_NLCA08_NLCB08_NLPA01_NLPB01_TT008_TT108_TTE08_WG008_WG108_WGE08
-# Cijk_Ailk_Bjlk_DB_DU16_LU16_MT064_MT164_NLA16_NLB16_NLCA08_NLCB08_NLPA02_NLPB02_TT008_TT108_TTE08_WG008_WG108_WGE08
-# Cijk_Ailk_Bjlk_DB_DU08_LU08_MT064_MT164_NLA08_NLB08_NLCA01_NLCB08_NLPA08_NLPB01_TT008_TT108_TTE08_WG008_WG108_WGE08
-
-    return True
-
diff --git a/Tensile/Source/Client.h b/Tensile/Source/Client.h
index eee665ca4..401b6ab8f 100644
--- a/Tensile/Source/Client.h
+++ b/Tensile/Source/Client.h
@@ -30,6 +30,7 @@
 #include <fstream>
 
 TensileTimer timer;
+TensileTimer apiTimer;
 std::ofstream file;
 
 void initControls();
@@ -135,7 +136,8 @@ bool callLibrary(
             std::cout << "  Device | Reference" << std::endl;
             firstPrint = false;
           }
-          std::cout << "[" << (numChecked-1) << "] " << i << ": " << tensileToString(deviceOnHostC[i])
+          std::cout << "[" << (numChecked-1) << "] " << i << ": "
+            << tensileToString(deviceOnHostC[i])
             << (equal ? "==" : "!=") << tensileToString(referenceC[i])
             << std::endl;
           printIdx++;
@@ -146,10 +148,14 @@ bool callLibrary(
 
   // time solution
   timer.start();
+  double apiTimeUs = 0;
   for (unsigned int syncIdx = 0; syncIdx < numSyncsPerBenchmark; syncIdx++) {
+    apiTimer.start();
     for (unsigned int enqIdx = 0; enqIdx < numEnqueuesPerSync; enqIdx++) {
       generatedCallToFunction( userSizes, alpha, beta );
     }
+    double currentApiTimeUs = apiTimer.elapsed_us() / numEnqueuesPerSync;
+    apiTimeUs += currentApiTimeUs;
     // sync
 #if Tensile_BACKEND_OCL
     status = clFinish(stream); tensileStatusCheck(status);
@@ -158,6 +164,7 @@ bool callLibrary(
 #endif
     tensileStatusCheck(status);
   } // sync loop
+  apiTimeUs /= numSyncsPerBenchmark;
 
   double timeMs = timer.elapsed_ms()
     / numSyncsPerBenchmark / numEnqueuesPerSync;
@@ -174,30 +181,35 @@ bool callLibrary(
       << numFunctions << "]:"
       << std::setw(10) << std::fixed << std::setprecision(3)
       << gflops << " GFlop/s";
-      if (newFastest) {
-        std::cout << "*";
-      } else {
-        std::cout << " ";
-      }
+    if (newFastest) {
+      std::cout << "*";
+    } else {
+      std::cout << " ";
+    }
     std::cout << " |"
       << std::setw(9) << std::fixed << std::setprecision(3) << timeMs
       << " ms | v: " << (numInvalids ? "FAILED" : "PASSED")
-      << " p: " << (numChecked-numInvalids) << "/" << numChecked << std::endl;
+      << " " << (numChecked-numInvalids) << "/" << numChecked;
+    std::cout << " | api:" << std::setw(6) << std::fixed 
+      << std::setprecision(3) << apiTimeUs << " us";
+    std::cout << std::endl;
   } else {
     std::cout << "Function[" << functionIdx << "/" << numFunctions << "]:"
       << std::setw(10) << std::fixed << std::setprecision(3)
       << gflops << " GFlop/s";
-      if (newFastest) {
-        std::cout << "*";
-      } else {
-        std::cout << " ";
-      }
+    if (newFastest) {
+      std::cout << "*";
+    } else {
+      std::cout << " ";
+    }
     std::cout << " |"
       << std::setw(9) << std::fixed << std::setprecision(3) << timeMs << " ms";
-      if (newFastest) {
-        std::cout << "*";
-      }
-      std::cout << std::endl;
+    if (newFastest) {
+      std::cout << "*";
+    }
+    std::cout << " | api:" << std::setw(6) << std::fixed 
+      << std::setprecision(3) << apiTimeUs << " us";
+    std::cout << std::endl;
   }
   return (numInvalids > 0);
 } // callLibrary
@@ -359,7 +371,7 @@ bool benchmarkAllSolutionsForSize(
       }
       std::cout << " |"
         << std::setw(9) << std::fixed << std::setprecision(3) << timeMs << " ms | v: " << (numInvalids ? "FAILED" : "PASSED")
-        << " p: " << (numChecked-numInvalids) << "/" << numChecked << std::endl;
+        << " " << (numChecked-numInvalids) << "/" << numChecked << std::endl;
     }
 #if 1
     else {
diff --git a/Tensile/TensileCreateLibrary.py b/Tensile/TensileCreateLibrary.py
index ef7c13392..222a71d55 100644
--- a/Tensile/TensileCreateLibrary.py
+++ b/Tensile/TensileCreateLibrary.py
@@ -171,9 +171,8 @@ def writeLogic(outputPath, logicList, solutionWriter ):
     scheduleName = logicProblemType[0]
     problemType = logicProblemType[1]
     solutions = logicProblemType[2]
-    skinnyLogic0 = logicProblemType[3]
-    skinnyLogic1 = logicProblemType[4]
-    diagonalLogic = logicProblemType[5]
+    indexOrder = logicProblemType[3]
+    logic = logicProblemType[4]
 
     # solution names
     solutionNames = []
@@ -208,6 +207,7 @@ def writeLogic(outputPath, logicList, solutionWriter ):
     for i in range(0, len(argList)):
       s += "    %s%s" % (argList[i], ",\n" if i < len(argList)-1 else ") {\n\n")
 
+    """
     indent = "  "
     s += "%ssize_t sizeC = size%s" % ( indent, indexChars[0])
     for i in range(1, problemType["NumIndicesC"]):
@@ -218,56 +218,11 @@ def writeLogic(outputPath, logicList, solutionWriter ):
     for i in range(1, len(problemType["IndicesSummation"])):
       s += "*size%s" % indexChars[problemType["IndicesSummation"][i]]
     s += ";\n\n"
-    for rule in skinnyLogic0:
-      print2(rule)
-    for rule in skinnyLogic1:
-      print2(rule)
+    """
     print2(solutionNames)
 
-    for ruleIdx in range(0, len(diagonalLogic)):
-      rule = diagonalLogic[ruleIdx]
-      print2(rule)
-      winnerIdx = rule[0]
-      problemSize = rule[1]
-      minGFlops = rule[2]
-      maxGFlops = rule[3]
-      # rule logic
-      if ruleIdx == len(diagonalLogic)-1:
-        if len(diagonalLogic) > 1:
-          s += "%selse" % indent
-        else:
-          s += "%s" % indent
-      else:
-        s += "%s%s(sizeC >= static_cast<size_t>(%u" % (indent, ("if" if ruleIdx == 0 else "else if"), problemSize[0])
-        for i in range(1, problemType["NumIndicesC"]):
-          s += "*%u" % problemSize[i]
-        s += "))"
-      s += " return %s(" % solutionNames[winnerIdx]
-      # solution parameters
-      s += " dataC, dataA, dataB, alpha"
-      if problemType["UseBeta"]:
-        s += ", beta"
-      s += ", offsetC, offsetA, offsetB"
-      firstStride = 1
-      if problemType["UseInitialStrides"]:
-        firstStride = 0
-      lastStrideC = problemType["NumIndicesC"]
-      lastStrideA = len(problemType["IndexAssignmentsA"])
-      lastStrideB = len(problemType["IndexAssignmentsB"])
-
-      for i in range(firstStride,lastStrideC):
-        s += ", strideC%u%s" % (i, indexChars[i])
-      for i in range(firstStride,lastStrideA):
-        s += ", strideA%u%s" % (i, \
-            indexChars[problemType["IndexAssignmentsA"][i]])
-      for i in range(firstStride,lastStrideB):
-        s += ", strideB%u%s" % (i, \
-            indexChars[problemType["IndexAssignmentsB"][i]])
-      for i in range(0, problemType["TotalIndices"]):
-        s += ", size%s" % indexChars[i]
-      s += ", stream, numInputEvents, inputEvents, outputEvent ); /* [%f,%f] GFlops*/\n" % (minGFlops,maxGFlops)
-
-
+    logicStr = writeLogicRec(0, indexOrder, logic, solutionNames, problemType)
+    s += logicStr
     s += "\n}\n"
 
     # open and close individual files
@@ -289,6 +244,72 @@ def writeLogic(outputPath, logicList, solutionWriter ):
   logicHeaderFile.write(h)
   logicHeaderFile.close()
 
+################################################################################
+# Write Logic Recursive
+################################################################################
+def writeLogicRec(depth, indexOrder, logic, solutionNames, problemType):
+  # Builds the nested C "if (sizeX < threshold)" text for one index level.
+  # Each rule is [threshold, payload]; payload is a solution index at the
+  # last entry of indexOrder, otherwise the rule list for the next level.
+  indexChars = globalParameters["IndexChars"]
+  # two spaces of base indent plus two more per recursion depth
+  pad = "  " + "  " * depth
+  atLeaf = depth == len(indexOrder) - 1
+  pieces = []
+  for rule in logic:
+    limit = rule[0]
+    if atLeaf:
+      call = writeSolutionCall(solutionNames[rule[1]], problemType)
+      if limit > 0:
+        pieces.append("%sif (size%s < %u) return %s;\n" \
+            % (pad, indexChars[indexOrder[depth]], limit, call))
+      else: # non-positive threshold: emitted without a size guard
+        pieces.append("%sreturn %s;\n" % (pad, call))
+      continue
+    if limit > 0:
+      pieces.append("%sif (size%s < %u) {\n" \
+          % (pad, indexChars[indexOrder[depth]], limit))
+    else:
+      pieces.append("%s{\n" % pad)
+    pieces.append(writeLogicRec(depth+1, indexOrder, rule[1], \
+        solutionNames, problemType))
+    pieces.append("%s}\n" % pad)
+  return "".join(pieces)
+
+
+################################################################################
+# Write Solution Call
+################################################################################
+def writeSolutionCall(solutionName, problemType):
+  # Returns the C call expression "<solutionName>( ... )" with no trailing
+  # semicolon; the argument names are emitted as fixed text.
+  indexChars = globalParameters["IndexChars"]
+  args = ["%s( dataC, dataA, dataB, alpha" % solutionName]
+  if problemType["UseBeta"]:
+    args.append("beta")
+  args.append("offsetC, offsetA, offsetB")
+  # stride 0 is passed only when the problem type uses initial strides
+  firstStride = 0 if problemType["UseInitialStrides"] else 1
+  lastStrideC = problemType["NumIndicesC"]
+  assignA = problemType["IndexAssignmentsA"]
+  assignB = problemType["IndexAssignmentsB"]
+  # strides of C, then A, then B, each suffixed with its index character
+  args.extend("strideC%u%s" % (i, indexChars[i])
+      for i in range(firstStride, lastStrideC))
+  args.extend("strideA%u%s" % (i, indexChars[assignA[i]])
+      for i in range(firstStride, len(assignA)))
+  args.extend("strideB%u%s" % (i, indexChars[assignB[i]])
+      for i in range(firstStride, len(assignB)))
+  # one size argument per index
+  args.extend("size%s" % indexChars[i]
+      for i in range(0, problemType["TotalIndices"]))
+  # trailing stream/event arguments close the call
+  args.append("stream, numInputEvents, inputEvents, outputEvent )")
+  # every piece is separated by ", " in the emitted call
+  return ", ".join(args)
+
+
+
 
 ################################################################################
 # Write CMake
@@ -412,11 +433,12 @@ def TensileCreateLibrary():
     printExit("LogicPath %s doesn't exist" % logicPath)
 
   logicFiles = [os.path.join(logicPath, f) for f in os.listdir(logicPath) \
-      if os.path.isfile(os.path.join(logicPath, f))]
+      if (os.path.isfile(os.path.join(logicPath, f)) \
+      and os.path.splitext(f)[1]==".yaml")]
 
-  print2("# LibraryLogicFiles:" % logicFiles)
+  print1("# LibraryLogicFiles:") # header only; each file is printed below
   for logicFile in logicFiles:
-    print2("#   %s" % logicFile)
+    print1("#   %s" % logicFile)
 
   ##############################################################################
   # Parse config files
@@ -424,10 +446,10 @@ def TensileCreateLibrary():
   solutions = []
   logicList = []
   for logicFileName in logicFiles:
-    (scheduleName, problemType, solutionsForType, skinnyLogic0, skinnyLogic1, \
-        diagonalLogic) = YAMLIO.readLibraryLogicForProblemType(logicFileName)
+    (scheduleName, problemType, solutionsForType, indexOrder, logic) \
+        = YAMLIO.readLibraryLogicForProblemType(logicFileName)
     logicList.append((scheduleName, problemType, solutionsForType, \
-        skinnyLogic0, skinnyLogic1, diagonalLogic))
+        indexOrder, logic ))
     for solution in solutionsForType:
       if solution not in solutions:
         solutions.append(solution)
diff --git a/Tensile/YAMLIO.py b/Tensile/YAMLIO.py
index c0f7393b3..a806bd54e 100644
--- a/Tensile/YAMLIO.py
+++ b/Tensile/YAMLIO.py
@@ -85,27 +85,52 @@ def readSolutions( filename ):
 # 1 yaml per problem type
 # problemType, skinny0, skinny1, diagonal
 ################################################################################
-def writeLibraryLogicForProblemType( filePath, schedulePrefix, logic):
-  problemType   = logic[0]
-  solutions     = logic[1]
-  skinnyLogic0  = logic[2]
-  skinnyLogic1  = logic[3]
-  diagonalLogic = logic[4]
+def writeLibraryLogicForProblemType( filePath, schedulePrefix, logicTuple):
+  problemType   = logicTuple[0]
+  solutions     = logicTuple[1]
+  indexOrder    = logicTuple[2]
+  logic         = logicTuple[3]
   filename = os.path.join(filePath, "%s_%s.yaml" \
       % (schedulePrefix, str(problemType)))
   print2("# writeLogic( %s )" % ( filename ))
 
-  # open file
+  data = []
+  # logic name
+  data.append(globalParameters["Name"])
+  # problem type
+  problemTypeState = problemType.state
+  problemTypeState["DataType"] = \
+      problemTypeState["DataType"].value
+  data.append(problemTypeState)
+  # solutions
+  solutionList = []
+  for solution in solutions:
+    solutionState = solution.state
+    solutionState["ProblemType"] = solutionState["ProblemType"].state
+    solutionState["ProblemType"]["DataType"] = \
+        solutionState["ProblemType"]["DataType"].value
+    solutionList.append(solutionState)
+  data.append(solutionList)
+  # index order
+  data.append(indexOrder)
+  # logic
+  data.append(logic)
+
+  # open & write file
   try:
     stream = open(filename, "w")
+    #yaml.dump(data, stream, default_flow_style=False)
+    yaml.dump(data, stream)
+    stream.close()
   except IOError:
     printExit("Cannot open file: %s" % filename)
 
+  """
+  #data = [ globalParameters["Name"], problemTypeState, [], [], [] ]
   # write problem type
   problemTypeState = problemType.state
   problemTypeState["DataType"] = \
       problemTypeState["DataType"].value
-  data = [ globalParameters["Name"], problemTypeState, [], [], [], [] ]
   for solution in solutions:
     solutionState = solution.state
     solutionState["ProblemType"] = solutionState["ProblemType"].state
@@ -122,6 +147,7 @@ def writeLibraryLogicForProblemType( filePath, schedulePrefix, logic):
   #stream.write(data)
   yaml.dump(data, stream, default_flow_style=False)
   stream.close()
+  """
 
 
 def readLibraryLogicForProblemType( filename ):
@@ -134,19 +160,20 @@ def readLibraryLogicForProblemType( filename ):
   stream.close()
 
   # verify
-  if len(data) < 6:
-    printExit("len(%s) %u < 6" % (filename, len(data)))
+  if len(data) < 5: # scheduleName, problemType, solutions, indexOrder, logic
+    printExit("len(%s) %u < 5" % (filename, len(data)))
 
   # parse out objects
   scheduleName = data[0]
   problemTypeState = data[1]
   solutionStates = data[2]
-  skinnyLogic0 = data[3]
-  skinnyLogic1 = data[4]
-  diagonalLogic = data[5]
+  indexOrder = data[3]
+  logic = data[4]
 
-  solutions = []
+  # unpack problemType
   problemType = ProblemType(problemTypeState)
+  # unpack solutions
+  solutions = []
   for i in range(0, len(solutionStates)):
     solutionState = solutionStates[i]
     solutionObject = Solution(solutionState)
@@ -155,5 +182,4 @@ def readLibraryLogicForProblemType( filename ):
           % (problemType, solutionObject["ProblemType"]))
     solutions.append(solutionObject)
 
-  return (scheduleName, problemType, solutions, skinnyLogic0, skinnyLogic1, \
-      diagonalLogic)
+  return (scheduleName, problemType, solutions, indexOrder, logic )
diff --git a/Tensile/__init__.py b/Tensile/__init__.py
index 58039f505..8a124bf64 100644
--- a/Tensile/__init__.py
+++ b/Tensile/__init__.py
@@ -1 +1 @@
-__version__ = "2.1.1"
+__version__ = "2.2.0"