Optimize x86 pipeline on FirstFitPriority
ShikharJ committed Jan 5, 2021
1 parent 8bdab94 commit fbab6cb
Showing 5 changed files with 48 additions and 6 deletions.
2 changes: 1 addition & 1 deletion tools/SeeDot/seedot/compiler/codegen/codegenBase.py
@@ -838,7 +838,7 @@ def sortkey(a):
varToLiveRange.sort(key=sortkey, reverse=True)
memAlloc = [(l * m // 8, i, j) for ([i, j], k, l, m) in varToLiveRange if k not in self.notScratch]
varOrderAndSize = [(k, l * m // 8) for ([i, j], k, l, m) in varToLiveRange if k not in self.notScratch]
- maxAllowedMemUsage = 200000
+ maxAllowedMemUsage = Config.memoryLimit
timeout = 60
bestCaseMemUsage = DLXInputGen.generateDLXInput(memAlloc, 1, 0, True)
if maxAllowedMemUsage < bestCaseMemUsage:
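
The hunk above swaps the hardcoded 200000-byte budget for the new Config.memoryLimit knob defined in util.py below. As a rough, self-contained sketch of the data it operates on, here is a toy version of the memAlloc construction; the live-range field order ([start, end], name, bitwidth, element count) is an assumption inferred from l * m // 8 yielding a byte count.

# Toy reconstruction of the memAlloc tuples fed to DLXInputGen; the
# live-range field order is assumed, not confirmed by this commit.
notScratch = {"X"}
varToLiveRange = [
    ([0, 4], "W",   16, 1000),  # 16-bit tensor with 1000 elements
    ([2, 6], "tmp",  8,  256),  # 8-bit scratch buffer
    ([1, 3], "X",   16,  784),  # model input, excluded via notScratch
]
memAlloc = [(l * m // 8, i, j)
            for ([i, j], k, l, m) in varToLiveRange if k not in notScratch]
print(memAlloc)  # [(2000, 0, 4), (256, 2, 6)] as (bytes, liveStart, liveEnd)

DLXInputGen.generateDLXInput(memAlloc, 1, 0, True) then computes the best-case memory usage, which the guard above now compares against Config.memoryLimit instead of the old fixed constant.
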
11 changes: 11 additions & 0 deletions tools/SeeDot/seedot/compiler/compiler.py
@@ -31,6 +31,8 @@

import seedot.config as config

+ import numpy as np

# The Compiler class reads in the input code, converts it first into an AST, and subsequently into an IR which
# contains a sequence of function calls (which are implemented by hand in a library). The IR is fed into the
# desired target codegen, which outputs the C/C++ code which can be run on the target device.
@@ -55,6 +57,7 @@ def __init__(self, algo, version, target, inputFile, outputDir, profileLogFile,
self.generateAllFiles = generateAllFiles
self.id = str(id) if id is not None else ""
self.printSwitch = printSwitch
+ self.varSizes = {}

self.intermediateScales = {}
self.substitutions = substitutions
@@ -145,6 +148,14 @@ def genCodeWithFuncCalls(self, ast):
# All state variables are used for codegen.
state = [compiler.varDeclarations, compiler.varDeclarationsLocal, compiler.varScales, compiler.varIntervals, compiler.intConstants, compiler.expTables, compiler.globalVars, compiler.internalVars, compiler.floatConstants, compiler.substitutions, compiler.demotedVarsOffsets, compiler.varsForBitwidth, compiler.varLiveIntervals, compiler.notScratch, compiler.coLocatedVariables]

+ for key in compiler.varDeclarations.keys():
+     val = compiler.varDeclarations[key]
+     if type.isTensor(val):
+         dims = val.shape
+         self.varSizes[key] = np.prod(dims)
+     else:
+         self.varSizes[key] = 1

# Raw live ranges do not capture the scope of the first/last usage of a variable, so they require post-processing.
state[12] = self.adjustLiveRanges(state[12], compiler.allDepths)

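
The loop added above records how many elements each declared variable holds. Here is a minimal, runnable sketch of the same bookkeeping, with hasattr(val, "shape") standing in for SeeDot's type.isTensor check (a simplification for illustration).

# Minimal sketch of the varSizes bookkeeping added in genCodeWithFuncCalls;
# hasattr(val, "shape") is a stand-in for SeeDot's type.isTensor check.
import numpy as np

varDeclarations = {"W": np.zeros((10, 32)), "bias": 0.5}
varSizes = {}
for key, val in varDeclarations.items():
    if hasattr(val, "shape"):                   # tensor: product of its dims
        varSizes[key] = int(np.prod(val.shape))
    else:                                       # scalar: a single element
        varSizes[key] = 1
print(varSizes)  # {'W': 320, 'bias': 1}

These counts reach Main through obj.varSizes in main.py below, where they drive the order in which variables are tried for 8-bit demotion.
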
12 changes: 10 additions & 2 deletions tools/SeeDot/seedot/config.py
@@ -6,9 +6,11 @@
availableBitwidths = [8, 16, 32]

# Range of max scale factor used for exploration.
+ # In the old SeeDot (PLDI'19), this explores across the maxscale parameter.
+ # In the new SeeDot (OOPSLA'20), this explores across the scale of the input variable 'X'.
maxScaleRange = 0, -wordLength

- # TanH approximation limit.
+ # TanH approximation limit. Used by the old SeeDot (PLDI'19).
tanhLimit = 1.0

# MSBuild location
@@ -18,7 +20,7 @@
r"C:\Program Files (x86)\Microsoft Visual Studio\2019\Professional\MSBuild\Current\Bin\MSBuild.exe"
]

- # Not supported (ddsEnabled = False and vbwEnabled = True).
+ # IMPORTANT NOTE: Unsupported configuration (ddsEnabled = False and vbwEnabled = True).

# Enable data-driven scale computation. Setting this to False reverts the compiler to the old version (PLDI'19).
ddsEnabled = True
@@ -43,6 +45,12 @@
# Number of offsets tried out for each variable (except X, for which 9 are tried) when they are demoted to 8 bits one at a time.
offsetsPerDemotedVariable = 3

+ # For a classification algorithm, the fixed-point code may lose at most this much accuracy compared to the floating-point code. Not used for regression algorithms.
+ permittedClassificationAccuracyLoss = 2.0

+ # For a regression algorithm, the fixed-point code may incur at most this much additional numerical loss compared to the floating-point code. Not used for classification algorithms.
+ permittedRegressionNumericalLossMargin = 90.0

# The following classes are used as sanity checks for arguments passed to the compiler, to prevent unexpected arguments from being passed.
# These lists should be updated as the compiler is expanded to multiple algorithms and datasets.

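
The two thresholds added above replace the magic numbers 2.0 and 90.0 in performSearch (see the main.py hunks below). A toy illustration of both acceptance gates, with illustrative values; note that for regression the tracked metric is a numerical loss, so lower is better.

# Toy illustration of the acceptance gates these thresholds feed into.
permittedClassificationAccuracyLoss = 2.0      # max accuracy drop, percentage points
permittedRegressionNumericalLossMargin = 90.0  # max numerical loss for regression

flAccuracy, fixedAcc = 95.0, 93.5              # floating-point vs fixed-point accuracy
classificationOk = (flAccuracy - fixedAcc) <= permittedClassificationAccuracyLoss
print(classificationOk)  # True: a 1.5-point drop fits the 2.0-point budget

fixedLoss = 42.0                               # numerical loss of the fixed-point code
regressionOk = fixedLoss <= permittedRegressionNumericalLossMargin
print(regressionOk)      # True: stays under the 90.0 margin
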
25 changes: 22 additions & 3 deletions tools/SeeDot/seedot/main.py
@@ -84,6 +84,8 @@ def __init__(self, algo, version, target, trainingFile, testingFile, modelDir, s
self.biasShifts = {}
# For simplifying bias addition, populated after every code run, used for M3 codegen.
# In operations like WX + B, B is mostly used once in the code. So all the fixed point computations are clubbed into one.
+ self.varSizes = {}
+ # Map from a variable to the number of elements it holds. Populated in floating-point mode.

# This function is invoked right at the beginning for moving around files into the working directory.
def setup(self):
@@ -181,6 +183,7 @@ def compile(self, version, target, sf, generateAllFiles=True, id=None, printSwit
if version == config.Version.floatt:
self.variableSubstitutions = obj.substitutions
self.variableToBitwidthMap = dict.fromkeys(obj.independentVars, config.wordLength)
+ self.varSizes = obj.varSizes

self.problemType = obj.problemType
if id is None:
@@ -495,7 +498,23 @@ def performSearch(self):
totalSize = len(self.varDemoteDetails)
numBatches = int(np.ceil(totalSize / redBatchSize))

- sortedVars = [i for (i, j) in self.varDemoteDetails]
+ sortedVars1 = []
+ sortedVars2 = []
+ for ((demoteVars, offset), _) in self.varDemoteDetails:
+     variableInMap = False
+     for demoteVar in demoteVars:
+         if demoteVar in self.varSizes:
+             variableInMap = True
+             if self.varSizes[demoteVar] >= Util.Config.largeVariableLimit:
+                 sortedVars1.append((demoteVars, offset))
+                 break
+             else:
+                 sortedVars2.append((demoteVars, offset))
+                 break
+     if not variableInMap:
+         sortedVars2.append((demoteVars, offset))
+
+ sortedVars = sortedVars1 + sortedVars2

self.varDemoteDetails = []
demotedVarsOffsets = dict(self.demotedVarsOffsets)
@@ -544,9 +563,9 @@ def performSearch(self):
acceptedAcc = lastStageAcc
for ((demotedVars, _), metrics) in self.varDemoteDetails:
acc = metrics[0]
- if self.problemType == config.ProblemType.classification and (self.flAccuracy - acc) > 2.0:
+ if self.problemType == config.ProblemType.classification and (self.flAccuracy - acc) > config.permittedClassificationAccuracyLoss:
break
- elif self.problemType == config.ProblemType.regression and acc > 90.0:
+ elif self.problemType == config.ProblemType.regression and acc > config.permittedRegressionNumericalLossMargin:
break
else:
okToDemote = demotedVars
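
The key change in performSearch above partitions demotion candidates by size: any candidate group containing a variable with at least Util.Config.largeVariableLimit elements is explored first. A condensed sketch of that partition; unlike the code above, which classifies a group on the first variable it finds in the map, this version checks every variable, so it is a slight simplification.

# Condensed sketch of the size-based demotion ordering; varSizes and the
# candidate tuples mirror the structures used in performSearch.
largeVariableLimit = 50000
varSizes = {"W1": 120000, "B1": 128, "W2": 64000}
varDemoteDetails = [((["B1"], 0), None), ((["W1"], -1), None), ((["W2"], 2), None)]

large, small = [], []
for ((demoteVars, offset), _) in varDemoteDetails:
    known = [v for v in demoteVars if v in varSizes]
    if known and any(varSizes[v] >= largeVariableLimit for v in known):
        large.append((demoteVars, offset))   # big tensors: try demoting first
    else:
        small.append((demoteVars, offset))   # small or unknown: try later

sortedVars = large + small
print([vs for (vs, _) in sortedVars])  # [['W1'], ['W2'], ['B1']]

A plausible motivation: the largest tensors dominate the memory footprint, so demoting them first realizes most of the potential savings early in the search.
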
4 changes: 4 additions & 0 deletions tools/SeeDot/seedot/util.py
@@ -31,6 +31,10 @@ class Config:
# -> Similarly in multiplication-like functions such as convolution, Hadamard product, etc.
x86MemoryOptimize = True
# Enable memory optimization in the generated fixed-point code for the x86, Arduino, or M3 codegen.
+ memoryLimit = 200000
+ # The maximum memory available on the target device. Used if memory optimizations are enabled in the target codegen.
+ largeVariableLimit = 50000
+ # Any variable with more elements than this is prioritized for demotion to 8 bits.
defragmentEnabled = False
# Enable defragmentation. Currently not supported, so must be kept to False.
faceDetectionHacks = False
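
Both new knobs are plain class attributes on util.Config, so a build for a device with a different memory budget could presumably override them before compilation. This is assumed usage; the commit itself only defines the defaults.

# Hypothetical override for a smaller target; the attribute names are real,
# but setting them like this is assumed usage, not shown in this commit.
import seedot.util as Util

Util.Config.memoryLimit = 96000         # ~96 KB of usable RAM on the target
Util.Config.largeVariableLimit = 20000  # prioritize tensors with >= 20k elements
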
