diff --git a/Tensile/BenchmarkProblems.py b/Tensile/BenchmarkProblems.py index 8ec4ed719..62d9867b4 100644 --- a/Tensile/BenchmarkProblems.py +++ b/Tensile/BenchmarkProblems.py @@ -37,7 +37,7 @@ def benchmarkProblemType( config ): totalBenchmarkSteps = len(benchmarkProcess) winners = WinningParameterDict() - print1("NumBenchmarkSteps: %u" % totalBenchmarkSteps) + print1("# NumBenchmarkSteps: %u" % totalBenchmarkSteps) print1("") print1(HR) print1("# Done Creating BenchmarkProcess Object") @@ -148,6 +148,7 @@ def benchmarkProblemType( config ): # Enumerate Solutions = Hardcoded * Benchmark ############################################################################ sys.stdout.write("# Enumerating Solutions") + solutionSet = set() # avoid duplicates for nlca=-1, 1 for hardcodedIdx in range(0, numHardcoded): solutions.append([]) hardcodedParamDict = benchmarkStep.hardcodedParameters[hardcodedIdx] @@ -169,10 +170,15 @@ def benchmarkProblemType( config ): benchmarkStep.initialSolutionParameters[initialSolutionParameterName] # TODO check if solution matches problem size for exact tile kernels solutionObject = Solution(solution) - if SolutionWriter.solutionParametersConsistent(solutionObject): - solutions[hardcodedIdx].append(solutionObject) - if globalParameters["PrintLevel"] >= 1: - sys.stdout.write("|") + if solutionObject["Valid"]: + if solutionObject not in solutionSet: + solutionSet.add(solutionObject) + solutions[hardcodedIdx].append(solutionObject) + if globalParameters["PrintLevel"] >= 1: + sys.stdout.write("|") + else: + if globalParameters["PrintLevel"] >= 1: + sys.stdout.write(":") else: if globalParameters["PrintLevel"] >= 1: sys.stdout.write(".") @@ -541,8 +547,8 @@ def get( lookupHardcodedParameters, winners ): #for paramName in hardcodedFrozen: # paramValue = hardcodedFrozen[paramName] # matchUnion[paramName] = paramValue - Solution.assignDimsFromEdgeAndShape(matchUnion) - Solution.assignDimsFromEdgeAndShape(hardcodedFrozen.parameters) + 
Solution.assignProblemIndependentDerivedParameters(matchUnion) + Solution.assignProblemIndependentDerivedParameters(hardcodedFrozen.parameters) if matchUnion["MacroTile0"] != lookupMacroTile0 \ or matchUnion["MacroTile1"] != lookupMacroTile1: matchMacroTile = False diff --git a/Tensile/BenchmarkStructs.py b/Tensile/BenchmarkStructs.py index b119001ae..48bbf9813 100644 --- a/Tensile/BenchmarkStructs.py +++ b/Tensile/BenchmarkStructs.py @@ -2,7 +2,7 @@ # from copy import * from copy import copy, deepcopy -from Common import print1, print2, printWarning, defaultSolution, defaultProblemSizes, defaultBenchmarkFinalProblemSizes, defaultBenchmarkCommonParameters, hasParam, defaultBenchmarkJoinParameters, getParamValues, defaultForkParameters, defaultBenchmarkForkParameters, defaultJoinParameters, printExit +from Common import print1, print2, printWarning, defaultSolution, defaultProblemSizes, defaultBenchmarkFinalProblemSizes, defaultBenchmarkCommonParameters, hasParam, defaultBenchmarkJoinParameters, getParamValues, defaultForkParameters, defaultBenchmarkForkParameters, defaultJoinParameters, printExit, globalParameters from SolutionStructs import Solution, ProblemType, ProblemSizes ################################################################################ @@ -399,14 +399,16 @@ def convertParametersToSteps(self): macroTileDim0 = workGroupEdgeValues[workGroupEdgeIdx]*threadTileEdgeValues[threadTileEdgeIdx] macroTileDim1 = macroTileDim0 if workGroupShapeValues[workGroupShapeIdx] < 0: - macroTileDim1 /= 2 + macroTileDim0 *= abs(workGroupShapeValues[workGroupShapeIdx]) elif workGroupShapeValues[workGroupShapeIdx] > 0: - macroTileDim1 *= 2 + macroTileDim1 *= abs(workGroupShapeValues[workGroupShapeIdx]) if threadTileShapeValues[threadTileShapeIdx] < 0: - macroTileDim1 /= 2 + macroTileDim0 *= abs(threadTileShapeValues[threadTileShapeIdx]) elif threadTileShapeValues[threadTileShapeIdx] > 0: - macroTileDim1 *= 2 - if macroTileDim0/macroTileDim1 <= 
self.initialSolutionParameters["MacroTileMaxRatio"] and macroTileDim1/macroTileDim0 <= self.initialSolutionParameters["MacroTileMaxRatio"]: + macroTileDim1 *= abs(threadTileShapeValues[threadTileShapeIdx]) + # TODO is this still useful? + if macroTileDim0/macroTileDim1 <= globalParameters["MaxMacroTileRatio"] \ + and macroTileDim1/macroTileDim0 <= globalParameters["MaxMacroTileRatio"]: macroTileJoinSet.add((macroTileDim0, macroTileDim1)) totalPermutations *=len(macroTileJoinSet) print2("JoinMacroTileSet(%u): %s" % (len(macroTileJoinSet), macroTileJoinSet) ) diff --git a/Tensile/ClientWriter.py b/Tensile/ClientWriter.py index f3289dcc8..de391a40e 100644 --- a/Tensile/ClientWriter.py +++ b/Tensile/ClientWriter.py @@ -1,4 +1,4 @@ -from Common import globalParameters, HR, pushWorkingPath, popWorkingPath, print1, CHeader, printExit +from Common import globalParameters, HR, pushWorkingPath, popWorkingPath, print1, CHeader, printExit, printWarning from SolutionStructs import Solution from SolutionWriter import SolutionWriter import YAMLIO @@ -54,13 +54,14 @@ def main( config ): ############################################################################## logicFiles = [os.path.join(libraryLogicPath, f) for f \ in os.listdir(libraryLogicPath) \ - if os.path.isfile(os.path.join(libraryLogicPath, f))] + if (os.path.isfile(os.path.join(libraryLogicPath, f)) \ + and os.path.splitext(f)[1]==".yaml")] print1("LogicFiles: %s" % logicFiles) functions = [] functionNames = [] for logicFileName in logicFiles: - (scheduleName, problemType, solutionsForType, skinnyLogic0, skinnyLogic1, \ - diagonalLogic) = YAMLIO.readLibraryLogicForProblemType(logicFileName) + (scheduleName, problemType, solutionsForType, indexOrder, logic) \ + = YAMLIO.readLibraryLogicForProblemType(logicFileName) functions.append((scheduleName, problemType)) functionNames.append("tensile_%s_%s" % (scheduleName, problemType)) @@ -139,15 +140,22 @@ def writeRunScript(path, libraryLogicPath, forBenchmark): 
runScriptFile.write("cmake --build . --config %s%s\n" \ % (globalParameters["CMakeBuildType"], " -- -j 8" \ if os.name != "nt" else "") ) - #if os.name != "nt": - # runScriptFile.write("find .\n") - runScriptFile.write("%s & echo %s & echo # Running Client & echo %s\n" \ - % (echoLine, HR, HR)) - if os.name == "nt": - runScriptFile.write(os.path.join(globalParameters["CMakeBuildType"], \ - "client.exe") ) + if forBenchmark: + if os.name == "nt": + runScriptFile.write(os.path.join(globalParameters["CMakeBuildType"], \ + "client.exe") ) + else: + runScriptFile.write("./client") else: - runScriptFile.write("./client") + executablePath = os.path.join(globalParameters["WorkingPath"]) + if os.name == "nt": + executablePath = os.path.join(executablePath, \ + globalParameters["CMakeBuildType"], \ + "client.exe") + else: + executablePath = os.path.join(executablePath, "client") + runScriptFile.write("%s & echo %s & echo # Library Client Path: & echo %s\n" \ + % (echoLine, HR, executablePath) ) runScriptFile.close() if os.name != "nt": os.chmod(runScriptName, 0777) diff --git a/Tensile/Common.py b/Tensile/Common.py index 8576813b2..2e3eb3151 100644 --- a/Tensile/Common.py +++ b/Tensile/Common.py @@ -49,8 +49,10 @@ globalParameters["DataInitType"] = 0 # 0=rand, 1=1, 2=serial # protect against invalid kernel globalParameters["MaxThreads"] = 256 -globalParameters["MaxRegisters"] = 256 +globalParameters["MinThreads"] = 64 globalParameters["MaxLDS"] = 32768 +globalParameters["MaxMacroTileRatio"] = 4 +globalParameters["MaxThreadTile"] = 64 ################################################################################ @@ -67,15 +69,17 @@ {"EdgeType": [ "Branch" ] }, # Shift {"EdgeMultiKernel": [ False ] }, {"PadLDS": [ 1 ] }, + {"SplitU": [ 1 ] }, + {"Prefetch": [ False ] }, ] # benchmark these solution independently defaultForkParameters = [ {"WorkGroupEdge": [ 16, 8 ] }, - {"WorkGroupShape": [ 0 ] }, # -1, 0, 1 + {"WorkGroupShape": [ 0 ] }, # -4, -2, 0, 2, 4 {"ThreadTileEdge": 
[ 1, 2, 4, 6, 8 ] }, - {"ThreadTileShape": [ 0 ] }, # -1, 0, 1 - {"SplitU": [ 1 ] }, - {"Prefetch": [ False ] }, + {"ThreadTileShape": [ 0 ] }, # -4, -2, 0, 2, 4 + {"NumLoadsCoalescedA": [ 1, -1 ] }, + {"NumLoadsCoalescedB": [ 1, -1 ] }, ] # keep one winner per solution and it affects which will win defaultBenchmarkForkParameters = [ @@ -88,18 +92,17 @@ ] # keep one winner per solution and it would affect which solutions fastest defaultBenchmarkJoinParameters = [ - {"NumLoadsCoalescedA": [ 1, 2, 3, 4, 6, 8 ] }, - {"NumLoadsCoalescedB": [ 1, 2, 3, 4, 6, 8 ] }, {"VectorWidthGlobalLoad": [ 4 ] }, {"VectorWidthGlobalStore": [ 4 ] }, {"VectorWidthLocalLoad": [ 4 ] }, {"VectorWidthLocalStore": [ 4 ] }, ] -# derrived parameters may show up in solution dict but don't use for naming -derrivedParameters = [ +# derived parameters may show up in solution dict but don't use for naming +derivedParameters = [ "MacroTile0", "MacroTile1", + "DepthU", "WorkGroup0", "WorkGroup1", "ThreadTile0", @@ -108,6 +111,7 @@ "NumLoadsB", "NumLoadsPerpendicularA", "NumLoadsPerpendicularB", + "NumThreads", ] # dictionary of defaults comprised for 1st option for each parameter @@ -118,7 +122,6 @@ for key, value in paramDict.iteritems(): defaultSolution[key] = value[0] # other non-benchmark options for solutions -defaultSolution["MacroTileMaxRatio"] = 2 ################################################################################ # Default Problem Type @@ -149,8 +152,10 @@ # Default Analysis Parameters ################################################################################ defaultAnalysisParameters = { - "Dilation": 3, - "Threshold": 0.1, + "InitialSolutionWindow": 4, + "BranchPenalty": 100, # microseconds / kernel + "SmoothOutliers": False, # enforce monotonic data + "SolutionImportanceMin": 0.01, # = 1% } diff --git a/Tensile/Configs/rocblas_cgemm.yaml b/Tensile/Configs/rocblas_cgemm.yaml index b1c5e2257..40e68b3a2 100644 --- a/Tensile/Configs/rocblas_cgemm.yaml +++ 
b/Tensile/Configs/rocblas_cgemm.yaml @@ -41,8 +41,8 @@ BenchmarkProblems: - ThreadTileShape: [ 0 ] - WorkGroupEdge: [8, 16] - ThreadTileEdge: [2, 4, 6, 8] - - NumLoadsCoalescedA: [ 1, 2, 3, 4, 6, 8 ] - - NumLoadsCoalescedB: [ 1, 2, 3, 4, 6, 8 ] + - NumLoadsCoalescedA: [ 1, -1 ] + - NumLoadsCoalescedB: [ 1, -1 ] - LoopUnroll: [8, 16] BenchmarkForkParameters: JoinParameters: @@ -74,8 +74,8 @@ BenchmarkProblems: - WorkGroupShape: [ 0 ] - ThreadTileEdge: [2, 4, 6, 8] - ThreadTileShape: [ 0 ] - - NumLoadsCoalescedA: [ 1, 2, 3, 4, 6, 8 ] - - NumLoadsCoalescedB: [ 1, 2, 3, 4, 6, 8 ] + - NumLoadsCoalescedA: [ 1, -1 ] + - NumLoadsCoalescedB: [ 1, -1 ] - LoopUnroll: [8, 16] BenchmarkForkParameters: JoinParameters: @@ -107,8 +107,8 @@ BenchmarkProblems: - WorkGroupShape: [ 0 ] - ThreadTileEdge: [2, 4, 6, 8] - ThreadTileShape: [ 0 ] - - NumLoadsCoalescedA: [ 1, 2, 3, 4, 6, 8 ] - - NumLoadsCoalescedB: [ 1, 2, 3, 4, 6, 8 ] + - NumLoadsCoalescedA: [ 1, -1 ] + - NumLoadsCoalescedB: [ 1, -1 ] - LoopUnroll: [8, 16] BenchmarkForkParameters: JoinParameters: @@ -140,8 +140,8 @@ BenchmarkProblems: - WorkGroupShape: [ 0 ] - ThreadTileEdge: [2, 4, 6, 8] - ThreadTileShape: [ 0 ] - - NumLoadsCoalescedA: [ 1, 2, 3, 4, 6, 8 ] - - NumLoadsCoalescedB: [ 1, 2, 3, 4, 6, 8 ] + - NumLoadsCoalescedA: [ 1, -1 ] + - NumLoadsCoalescedB: [ 1, -1 ] - LoopUnroll: [8, 16] BenchmarkForkParameters: JoinParameters: @@ -151,7 +151,6 @@ BenchmarkProblems: - ProblemSizes: [ [32, 32, 32, 4000], [32, 32, 32, 4000], [2], [1536] ] LibraryLogic: - Dilation: 3 - Threshold: 0.1 + BranchPenalty: 0 LibraryClient: diff --git a/Tensile/Configs/rocblas_dgemm.yaml b/Tensile/Configs/rocblas_dgemm.yaml index 36e358135..51e8eea11 100644 --- a/Tensile/Configs/rocblas_dgemm.yaml +++ b/Tensile/Configs/rocblas_dgemm.yaml @@ -41,8 +41,8 @@ BenchmarkProblems: - ThreadTileShape: [ 0 ] - WorkGroupEdge: [8, 16] - ThreadTileEdge: [2, 4, 6, 8] - - NumLoadsCoalescedA: [ 1, 2, 3, 4, 6, 8 ] - - NumLoadsCoalescedB: [ 1, 2, 3, 4, 6, 8 ] + - 
NumLoadsCoalescedA: [ 1, -1 ] + - NumLoadsCoalescedB: [ 1, -1 ] - LoopUnroll: [8, 16] BenchmarkForkParameters: JoinParameters: @@ -74,8 +74,8 @@ BenchmarkProblems: - WorkGroupShape: [ 0 ] - ThreadTileEdge: [2, 4, 6, 8] - ThreadTileShape: [ 0 ] - - NumLoadsCoalescedA: [ 1, 2, 3, 4, 6, 8 ] - - NumLoadsCoalescedB: [ 1, 2, 3, 4, 6, 8 ] + - NumLoadsCoalescedA: [ 1, -1 ] + - NumLoadsCoalescedB: [ 1, -1 ] - LoopUnroll: [8, 16] BenchmarkForkParameters: JoinParameters: @@ -107,8 +107,8 @@ BenchmarkProblems: - WorkGroupShape: [ 0 ] - ThreadTileEdge: [2, 4, 6, 8] - ThreadTileShape: [ 0, 1 ] - - NumLoadsCoalescedA: [ 1, 2, 3, 4, 6, 8 ] - - NumLoadsCoalescedB: [ 1, 2, 3, 4, 6, 8 ] + - NumLoadsCoalescedA: [ 1, -1 ] + - NumLoadsCoalescedB: [ 1, -1 ] - LoopUnroll: [8, 16] BenchmarkForkParameters: JoinParameters: @@ -140,8 +140,8 @@ BenchmarkProblems: - WorkGroupShape: [ 0 ] - ThreadTileEdge: [2, 4, 6, 8] - ThreadTileShape: [ 0 ] - - NumLoadsCoalescedA: [ 1, 2, 3, 4, 6, 8 ] - - NumLoadsCoalescedB: [ 1, 2, 3, 4, 6, 8 ] + - NumLoadsCoalescedA: [ 1, -1 ] + - NumLoadsCoalescedB: [ 1, -1 ] - LoopUnroll: [8, 16] BenchmarkForkParameters: JoinParameters: @@ -151,7 +151,6 @@ BenchmarkProblems: - ProblemSizes: [ [32, 32, 32, 4000], [32, 32, 32, 4000], [2], [1536] ] LibraryLogic: - Dilation: 3 - Threshold: 0.1 + BranchPenalty: 0 LibraryClient: diff --git a/Tensile/Configs/rocblas_sgemm.yaml b/Tensile/Configs/rocblas_sgemm.yaml index 346bf1cb7..c4ed34214 100644 --- a/Tensile/Configs/rocblas_sgemm.yaml +++ b/Tensile/Configs/rocblas_sgemm.yaml @@ -41,8 +41,8 @@ BenchmarkProblems: - ThreadTileShape: [ 0 ] - WorkGroupEdge: [8, 16] - ThreadTileEdge: [2, 4, 6, 8] - - NumLoadsCoalescedA: [ 1, 2, 3, 4, 6, 8 ] - - NumLoadsCoalescedB: [ 1, 2, 3, 4, 6, 8 ] + - NumLoadsCoalescedA: [ 1, -1 ] + - NumLoadsCoalescedB: [ 1, -1 ] - LoopUnroll: [8, 16] BenchmarkForkParameters: JoinParameters: @@ -74,8 +74,8 @@ BenchmarkProblems: - WorkGroupShape: [ 0 ] - ThreadTileEdge: [2, 4, 6, 8] - ThreadTileShape: [ 0 ] - - 
NumLoadsCoalescedA: [ 1, 2, 3, 4, 6, 8 ] - - NumLoadsCoalescedB: [ 1, 2, 3, 4, 6, 8 ] + - NumLoadsCoalescedA: [ 1, -1 ] + - NumLoadsCoalescedB: [ 1, -1 ] - LoopUnroll: [8, 16] BenchmarkForkParameters: JoinParameters: @@ -107,8 +107,8 @@ BenchmarkProblems: - WorkGroupShape: [ 0 ] - ThreadTileEdge: [2, 4, 6, 8] - ThreadTileShape: [ 0, 1 ] - - NumLoadsCoalescedA: [ 1, 2, 3, 4, 6, 8 ] - - NumLoadsCoalescedB: [ 1, 2, 3, 4, 6, 8 ] + - NumLoadsCoalescedA: [ 1, -1 ] + - NumLoadsCoalescedB: [ 1, -1 ] - LoopUnroll: [8, 16] BenchmarkForkParameters: JoinParameters: @@ -140,8 +140,8 @@ BenchmarkProblems: - WorkGroupShape: [ 0 ] - ThreadTileEdge: [2, 4, 6, 8] - ThreadTileShape: [ 0 ] - - NumLoadsCoalescedA: [ 1, 2, 3, 4, 6, 8 ] - - NumLoadsCoalescedB: [ 1, 2, 3, 4, 6, 8 ] + - NumLoadsCoalescedA: [ 1, -1 ] + - NumLoadsCoalescedB: [ 1, -1 ] - LoopUnroll: [8, 16] BenchmarkForkParameters: JoinParameters: @@ -151,7 +151,6 @@ BenchmarkProblems: - ProblemSizes: [ [32, 32, 32, 4000], [32, 32, 32, 4000], [2], [2880] ] LibraryLogic: - Dilation: 3 - Threshold: 0.1 + BranchPenalty: 0 LibraryClient: diff --git a/Tensile/Configs/rocblas_zgemm.yaml b/Tensile/Configs/rocblas_zgemm.yaml index badbd3dfb..4082ed6c8 100644 --- a/Tensile/Configs/rocblas_zgemm.yaml +++ b/Tensile/Configs/rocblas_zgemm.yaml @@ -41,8 +41,8 @@ BenchmarkProblems: - ThreadTileShape: [ 0 ] - WorkGroupEdge: [8, 16] - ThreadTileEdge: [2, 4, 6, 8] - - NumLoadsCoalescedA: [ 1, 2, 3, 4, 6, 8 ] - - NumLoadsCoalescedB: [ 1, 2, 3, 4, 6, 8 ] + - NumLoadsCoalescedA: [ 1, -1 ] + - NumLoadsCoalescedB: [ 1, -1 ] - LoopUnroll: [8, 16] BenchmarkForkParameters: JoinParameters: @@ -74,8 +74,8 @@ BenchmarkProblems: - WorkGroupShape: [ 0 ] - ThreadTileEdge: [2, 4, 6, 8] - ThreadTileShape: [ 0 ] - - NumLoadsCoalescedA: [ 1, 2, 3, 4, 6, 8 ] - - NumLoadsCoalescedB: [ 1, 2, 3, 4, 6, 8 ] + - NumLoadsCoalescedA: [ 1, -1 ] + - NumLoadsCoalescedB: [ 1, -1 ] - LoopUnroll: [8, 16] BenchmarkForkParameters: JoinParameters: @@ -107,8 +107,8 @@ 
BenchmarkProblems: - WorkGroupShape: [ 0 ] - ThreadTileEdge: [2, 4, 6, 8] - ThreadTileShape: [ 0 ] - - NumLoadsCoalescedA: [ 1, 2, 3, 4, 6, 8 ] - - NumLoadsCoalescedB: [ 1, 2, 3, 4, 6, 8 ] + - NumLoadsCoalescedA: [ 1, -1 ] + - NumLoadsCoalescedB: [ 1, -1 ] - LoopUnroll: [8, 16] BenchmarkForkParameters: JoinParameters: @@ -140,8 +140,8 @@ BenchmarkProblems: - WorkGroupShape: [ 0 ] - ThreadTileEdge: [2, 4, 6, 8] - ThreadTileShape: [ 0 ] - - NumLoadsCoalescedA: [ 1, 2, 3, 4, 6, 8 ] - - NumLoadsCoalescedB: [ 1, 2, 3, 4, 6, 8 ] + - NumLoadsCoalescedA: [ 1, -1 ] + - NumLoadsCoalescedB: [ 1, -1 ] - LoopUnroll: [8, 16] BenchmarkForkParameters: JoinParameters: @@ -151,7 +151,6 @@ BenchmarkProblems: - ProblemSizes: [ [32, 32, 32, 4000], [32, 32, 32, 4000], [2], [1536] ] LibraryLogic: - Dilation: 3 - Threshold: 0.1 + BranchPenalty: 0 LibraryClient: diff --git a/Tensile/Configs/sgemm.yaml b/Tensile/Configs/sgemm.yaml index 49e6d13c7..f9ddad0c5 100644 --- a/Tensile/Configs/sgemm.yaml +++ b/Tensile/Configs/sgemm.yaml @@ -40,19 +40,15 @@ BenchmarkProblems: ForkParameters: - WorkGroupEdge: [8, 16] - ThreadTileEdge: [2, 4, 8 ] - BenchmarkForkParameters: - - NumLoadsCoalescedA: [ 1, 2, 3, 4, 6 ] - - NumLoadsCoalescedB: [ 1, 2, 3, 4, 6 ] - LoopUnroll: [8, 16] + BenchmarkForkParameters: JoinParameters: - MacroTile - - DepthU BenchmarkJoinParameters: BenchmarkFinalParameters: - ProblemSizes: [ [16, 128], [16, 128], [256] ] LibraryLogic: - Dilation: 3 - Threshold: 0.1 + BranchPenalty: 0 LibraryClient: diff --git a/Tensile/Configs/sgemm_5760.yaml b/Tensile/Configs/sgemm_5760.yaml index d8f077b85..7c167a650 100644 --- a/Tensile/Configs/sgemm_5760.yaml +++ b/Tensile/Configs/sgemm_5760.yaml @@ -39,8 +39,6 @@ BenchmarkProblems: - ThreadTileShape: [0, 1] - WorkGroupEdge: [8, 16] - ThreadTileEdge: [ 8 ] - - NumLoadsCoalescedA: [ 1, 2, 3, 4, 6, 8 ] - - NumLoadsCoalescedB: [ 1, 2, 3, 4, 6, 8 ] - LoopUnroll: [8, 16] #BenchmarkForkParameters: JoinParameters: @@ -71,8 +69,6 @@ BenchmarkProblems: - 
ThreadTileShape: [0, 1] - WorkGroupEdge: [8, 16] - ThreadTileEdge: [4, 6, 8] - - NumLoadsCoalescedA: [ 1, 2, 3, 4, 5, 6 ] - - NumLoadsCoalescedB: [ 1, 2, 3, 4, 5, 6 ] - LoopUnroll: [8, 16] #BenchmarkForkParameters: JoinParameters: @@ -103,8 +99,6 @@ BenchmarkProblems: - ThreadTileShape: [0, 1] - WorkGroupEdge: [8, 16] - ThreadTileEdge: [4, 6, 8] - - NumLoadsCoalescedA: [ 1, 2, 3, 4, 5, 6 ] - - NumLoadsCoalescedB: [ 1, 2, 3, 4, 5, 6 ] - LoopUnroll: [8, 16] #BenchmarkForkParameters: JoinParameters: @@ -135,8 +129,6 @@ BenchmarkProblems: - ThreadTileShape: [0, 1] - WorkGroupEdge: [8, 16] - ThreadTileEdge: [4, 6, 8] - - NumLoadsCoalescedA: [ 1, 2, 3, 4, 5, 6 ] - - NumLoadsCoalescedB: [ 1, 2, 3, 4, 5, 6 ] - LoopUnroll: [8, 16] #BenchmarkForkParameters: JoinParameters: diff --git a/Tensile/Configs/tensor_contraction.yaml b/Tensile/Configs/tensor_contraction.yaml index dd44124d7..aa5e779f4 100644 --- a/Tensile/Configs/tensor_contraction.yaml +++ b/Tensile/Configs/tensor_contraction.yaml @@ -40,10 +40,8 @@ BenchmarkProblems: ForkParameters: - WorkGroupEdge: [8, 16] - ThreadTileEdge: [2, 4, 8 ] - BenchmarkForkParameters: - - NumLoadsCoalescedA: [ 1, 2, 3, 4, 6 ] - - NumLoadsCoalescedB: [ 1, 2, 3, 4, 6 ] - LoopUnroll: [8, 16] + BenchmarkForkParameters: JoinParameters: - MacroTile - DepthU @@ -52,7 +50,6 @@ BenchmarkProblems: - ProblemSizes: [ [16, 128], [16, 128], [2, 2, 4], [256] ] LibraryLogic: - Dilation: 3 - Threshold: 0.1 + BranchPenalty: 0 LibraryClient: diff --git a/Tensile/LibraryLogic.py b/Tensile/LibraryLogic.py index 881631f2b..823d48452 100644 --- a/Tensile/LibraryLogic.py +++ b/Tensile/LibraryLogic.py @@ -2,6 +2,7 @@ import os.path import array import csv +from sys import stdout from copy import deepcopy @@ -12,7 +13,7 @@ ################################################################################ # Analyze Problem Type ################################################################################ -def analyzeProblemType( problemTypeTuple, 
analysisParameters ): +def analyzeProblemType( problemTypeTuple, inputParameters ): problemType = problemTypeTuple[0] problemSizes = problemTypeTuple[1] dataFileName = problemTypeTuple[2] @@ -20,9 +21,7 @@ def analyzeProblemType( problemTypeTuple, analysisParameters ): print2(HR) print1("# %s" % problemType) - #print "# %s" % dataFileName - #print "# %s" % solutionsFileName - + ###################################### # Read Solutions (problemSizes, solutions) = YAMLIO.readSolutions(solutionsFileName) print2("# ProblemSizes: %s" % problemSizes) @@ -30,165 +29,89 @@ def analyzeProblemType( problemTypeTuple, analysisParameters ): print2("# Solutions:") solutionIdx = 0 for solution in solutions: - print2("# (%u) %s" % (solutionIdx, Solution.getNameMin(solution, solutionMinNaming))) + print2("# (%u) %s" % (solutionIdx, Solution.getNameMin(solution, \ + solutionMinNaming))) solutionIdx += 1 print2(HR) + ###################################### # Read Data From CSV - #numProblemSizes = problemSizes.numProblemSizes - data = BenchmarkDataAnalyzer(problemType, problemSizes, solutions, \ - analysisParameters) - data.populateFromCSV(dataFileName) - - ############################################################################## - # Determine Solutions Along Diagonal - # roughly same splitting regardless of sizeU - problemIndices = [] - for numProblemsForIndex in data.numProblemSizes: - problemIndices.append(numProblemsForIndex-1) - diagonalRules = data.getFastestSolutionsAlongDiagonal(problemIndices) - if True: - print2("Diagonal Rules:") - for rule in diagonalRules: - string = " if freeSize >=%4u" % data.problemIndexToSize[0][rule[1][0]] - for i in range(1, data.numIndices): - string += "x%4u" % data.problemIndexToSize[i][rule[1][i]] - string += " return S[%u] @ %5.0f-%5.0f>%5.0f GFlops is %s" \ - % (rule[0], rule[2], rule[3], rule[4], \ - data.solutionNames[rule[0]]) - print2(string) - - ############################################################################## - # 
Determine Skinny0 Solutions - skinnyRules01 = data.getSkinnySolutions(diagonalRules, problemIndices, \ - data.idx0, data.idx1) - #print "Skinny Rules:" - #for rule in skinnyRules01: - # string = " if freeSize >=%4u" % data.problemIndexToSize[0][rule[1][0]] - # for i in range(1, data.numIndices): - # string += "x%4u" % data.problemIndexToSize[i][rule[1][i]] - # string += " return S[%u] @ %5.0f-%5.0f>%5.0f GFlops is %s" \ - # % (rule[0], rule[2], rule[3], rule[4], \ - # data.solutionNames[rule[0]]) - - ############################################################################## - # Determine Skinny1 Solutions - skinnyRules10 = data.getSkinnySolutions(diagonalRules, problemIndices, \ - data.idx1, data.idx0) - - # list solutions that actually get used - solutionIndicesUsed = [] - for rule in skinnyRules01: - pass - for rule in skinnyRules10: - pass - for rule in diagonalRules: - solutionIdx = rule[0] - solution = solutions[solutionIdx] - MT0 = solution["MacroTile0"] - MT1 = solution["MacroTile1"] - DU = solution["DepthU"] - #print "Rule Tile S[%u]: %ux%ux%u" % (solutionIdx, MT0, MT1, DU) - # is this solution in the list - inList = False - for solutionUsed in solutionIndicesUsed: - if solutionUsed[0] == solutionIdx: - inList = True - break - if not inList: - insertIdx = len(solutionIndicesUsed) - for i in range(0, len(solutionIndicesUsed)): - iMT0 = solutionIndicesUsed[i][1] - iMT1 = solutionIndicesUsed[i][2] - iDU = solutionIndicesUsed[i][3] - #print " compare S[%u]: %ux%ux%u" % (solutionIndicesUsed[i][0], \ - # iMT0, iMT1, iDU) - if MT0*MT1 < iMT0*iMT1: - insertIdx = i - break - elif MT0*MT1 > iMT0*iMT1: - continue - else: # MT == MT - if DU < iDU: - insertIdx = i - break - else: - continue - - # if i'm smaller than i, insert me before i - #print "insert: %u" % insertIdx - solutionIndicesUsed.insert(insertIdx, [solutionIdx, MT0, MT1, DU]) - #print solutionIndicesUsed - - # list of solutions used - solutionsUsed = [] - for solutionIndexUsed in solutionIndicesUsed: - 
solutionsUsed.append(solutions[solutionIndexUsed[0]]) - - # translate rules to new solution indices - for rule in skinnyRules01: - pass - for rule in skinnyRules10: - pass - for ruleIdx in range(0, len(diagonalRules)): - solutionIdx = diagonalRules[ruleIdx][0] - for i in range(0, len(solutionIndicesUsed)): - solutionIndexUsed = solutionIndicesUsed[i] - if solutionIdx == solutionIndexUsed[0]: - diagonalRules[ruleIdx][0] = i - break - # change problemSizeIndices to sizes - for i in range(0, 3): - diagonalRules[ruleIdx][1][i] = \ - data.problemIndexToSize[i][ diagonalRules[ruleIdx][1][i] ] - - print2("# New Rules: %s" % diagonalRules) - + logicAnalyzer = LogicAnalyzer( \ + problemType, problemSizes, solutions, inputParameters) + logicAnalyzer.populateFromCSV(dataFileName) + + ###################################### + # Remove invalid solutions + logicAnalyzer.removeInvalidSolutions() + + ###################################### + # Remove least important solutions + logicAnalyzer.removeLeastImportantSolutions() + + ###################################### + # Correct outliers + if inputParameters["SmoothOutliers"]: + logicAnalyzer.smoothOutliers() + #logicAnalyzer.print2D([0, 0]) + + ###################################### + # Create Rules + logic = logicAnalyzer.enRule(0, logicAnalyzer.globalIndexRange) + print2("# Final Logic:") + print2(logic) + logicComplexity = [0]*logicAnalyzer.numIndices + logicAnalyzer.scoreLogicComplexity(logic, logicComplexity) + print2("Logic Complexity: %s" % logicComplexity) + score = logicAnalyzer.scoreRangeForLogic( \ + logicAnalyzer.globalIndexRange, logic) + print1("\nScore: %.0f ms" % (score/1000)) + + logicAnalyzer.prepareLogic(logic) #return (skinnyRules01, skinnyRules10, diagonalRules) - return (problemType, solutionsUsed, [], [], diagonalRules ) + #return (problemType, logicAnalyzer.solutionsUsed, [], [], logicAnalyzer.diagonalRules ) + return (problemType, logicAnalyzer.solutions, logicAnalyzer.indexOrder, logic) 
################################################################################ -# BenchmarkDataAnalyzer +# LogicAnalyzer ################################################################################ -class BenchmarkDataAnalyzer: - - ######################################## - # diagonal rule looks like - # 0: solutionIdx - # 1: problemIndices for minThreshold problem - # 2: gflops at above minSize - # 3: maxGFlops for this solution along diagonal in interval it won - # 4: gflops of prior winner at minSize, i.e., what performance did it beat - - ######################################## - # skinny rule looks like - # 0: solutionIdx - # 1: problemIndices for minThreshold problem - # 2: gflops at above minSize +class LogicAnalyzer: + ############################################################################## + ############################################################################## + ### + ### Entry / Top-Level Functions + ### + ############################################################################## + ############################################################################## - def __init__(self, problemType, problemSizes, solutions, analysisParameters): + ############################################################################## + # ENTRY: Init + ############################################################################## + def __init__(self, problemType, problemSizes, solutions, inputParameters): self.problemType = problemType self.problemSizes = problemSizes - self.analysisParameters = analysisParameters + self.parameters = inputParameters print2("ProblemSizes: %s" % self.problemSizes) # TODO verify that data is symmetric for diagonal #if self.problemSizes[self.problemType["Index0"]] \ # != self.problemSizes[self.problemType["Index1"]]: # printExit("d0 / d1 must be symmetric for analysis.") self.numProblemSizes = problemSizes.numProblemSizes # native order - print2("NumProblemSizes: %s" % self.numProblemSizes) + print1("NumProblemSizes: 
%s" % self.numProblemSizes) self.numIndices = len(self.numProblemSizes) self.solutions = solutions self.numSolutions = len(self.solutions) self.solutionMinNaming = Solution.getMinNaming(solutions) self.solutionNames = [] + self.solutionTiles = [] for solution in self.solutions: self.solutionNames.append(Solution.getNameMin(solution, \ self.solutionMinNaming)) + self.solutionTiles.append("%ux%u"%(solution["MacroTile0"], solution["MacroTile1"])) + self.flopsPerMac = self.problemType["DataType"].flopsPerMac() # special indices self.idx0 = self.problemType["Index0"] @@ -229,13 +152,23 @@ def __init__(self, problemType, problemSizes, solutions, analysisParameters): currentSize += currentStride currentStride += index[2] idx += 1 - #print "S->I %s" % self.problemSizeToIndex - #print "I->S %s" % self.problemIndexToSize + self.rangeIndicesFree = range(0, self.problemType["NumIndicesC"]) + self.rangeIndicesSummation = range(self.problemType["NumIndicesC"], \ + self.problemType["TotalIndices"]) + self.indexOrder = self.recommendedIndexOrder() + print2("IndexOrder: %s" % self.indexOrder) + self.globalIndexRange = [] + for i in range(0, self.numIndices): + self.globalIndexRange.append([0, self.numProblemSizes[i]]) + self.problemIndicesForGlobalRange \ + = self.problemIndicesForRange(self.globalIndexRange) + self.tab = [""]*self.numIndices ############################################################################## - # Read In CSV + # ENTRY: Read In CSV + ############################################################################## def populateFromCSV(self, dataFileName): # open file @@ -271,10 +204,6 @@ def populateFromCSV(self, dataFileName): for i in range(0, self.numIndices): problemIndices.append(self.problemSizeToIndex[i][problemSize[i]]) serialIdx = self.indicesToSerial(0, problemIndices) - #print "%s -> %s -> %u" % (problemSize, problemIndices, serialIdx) - - # total size - #totalFlops = float(row[totalSizeIdx]) # data solutionIdx = 0 @@ -288,295 +217,707 @@ def 
populateFromCSV(self, dataFileName): ############################################################################## - # Get Fastest Solutions Along Diagonal (d0=d1) for largest sizes + # ENTRY: Remove Invalid Solutions + ############################################################################## + def removeInvalidSolutions(self): + #problemIndices = [0]*self.numIndices + allSolutionValid = False + while not allSolutionValid: + moreProblems = True + invalidIdx = -1 + for problemIndices in self.problemIndicesForGlobalRange: + problemSerial = self.indicesToSerial(0, problemIndices) + for solutionIdx in range(0, self.numSolutions): + gflops = self.data[problemSerial+solutionIdx] + if gflops == 0: + invalidIdx = solutionIdx + break + if invalidIdx >= 0: + print1("# Removing Invalid Solution: %u %s" \ + % (invalidIdx, self.solutionNames[invalidIdx]) ) + self.removeSolution(invalidIdx) + else: + allSolutionValid = True + + + ############################################################################## + # ENTRY: Remove Least Important Solutions + ############################################################################## + def removeLeastImportantSolutions(self): + # Remove least important solutions + while True: + (lisIdx, lisPercSaved, lisPercWins, lisPercExec) \ + = self.leastImportantSolution() + if lisPercSaved < self.parameters["SolutionImportanceMin"]: + self.removeSolution(lisIdx) + continue + else: + break + + + ############################################################################## + # ENTRY: Smooth Outliers ############################################################################## - def getFastestSolutionsAlongDiagonal(self, problemIndices): - print2("\nFastest Diagonal idxU: %u" % problemIndices[self.idxU]) - # abstract to multidimensions - # what is the diagonal - dilation = self.analysisParameters["Dilation"] - threshold = self.analysisParameters["Threshold"] - numProblems0 = self.numProblemSizes[self.idx0] + def smoothOutliers(self): + 
problemSizes = [0]*self.numIndices + for problemIndices in self.problemIndicesForGlobalRange: + problemSerial = self.indicesToSerial(0, problemIndices) - ############################################################################ - # determine winner at largest size - solutionNumWins = [0]*self.numSolutions - solutionGFlops = [0]*self.numSolutions - for problemSizeIdx in range(max(0,numProblems0-dilation*2), numProblems0): - problemIndices[self.idx0] = problemSizeIdx - problemIndices[self.idx1] = problemSizeIdx - problemIdx = self.indicesToSerial(0, problemIndices) - winnerIdx = -1 - winnerGFlops = -1 for solutionIdx in range(0, self.numSolutions): - solutionSerialIdx = problemIdx + solutionIdx - solutionTmpGFlops = self.data[solutionSerialIdx] - if solutionTmpGFlops > winnerGFlops: - winnerIdx = solutionIdx - winnerGFlops = solutionTmpGFlops - #print "updated winner: ", winnerIdx - #print winnerIdx - solutionNumWins[winnerIdx] += 1 - if winnerGFlops > solutionGFlops[winnerIdx]: - solutionGFlops[winnerIdx] = winnerGFlops - largestWinnerIdx = -1 - largestWinnerNumWins = -1 - largestWinnerGFlops = -1 - #print "FastestWins:" + gflops = self.data[problemSerial+solutionIdx] + neighborGFlops = [] + smoothProblem = False + for iIdx in range(0, self.numIndices): + if problemIndices[iIdx] > 0 \ + and problemIndices[iIdx] < self.numProblemSizes[iIdx]-1: + neighborBeforeIndices = deepcopy(problemIndices) + neighborAfterIndices = deepcopy(problemIndices) + neighborBeforeIndices[iIdx] -= 1 + neighborAfterIndices[iIdx] += 1 + neighborBeforeIdx = self.indicesToSerial(0, neighborBeforeIndices) + neighborAfterIdx = self.indicesToSerial(0, neighborAfterIndices) + neighborBeforeGFlops = self.data[neighborBeforeIdx+solutionIdx] + neighborAfterGFlops = self.data[neighborAfterIdx+solutionIdx] + neighborGFlops.append(neighborBeforeGFlops) + neighborGFlops.append(neighborAfterGFlops) + if neighborBeforeGFlops > gflops \ + and neighborAfterGFlops < gflops : + smoothProblem = True + if 
smoothProblem: + s = "" + for i in range(0, self.numIndices): + problemSizes[i] = self.problemIndexToSize[i][problemIndices[i]] + s += "%u, " % problemSizes[i] + new = sum(neighborGFlops)/len(neighborGFlops) + old = self.data[problemSerial+solutionIdx] + s += "%f -> %f" % (old, new) + self.data[problemSerial+solutionIdx] \ + = sum(neighborGFlops)/len(neighborGFlops) + + + ############################################################################## + # ENTRY: En Rule + # currentIndexIndex = 0, 1, 2, 3... + # currentIndexRange will have only 1 size for prior indices (unless initial) + # + # Rule: + # [128, [ + # [64, [ + # [16, 0], + # [2880,1] + # ] + # ], + # [96, [ + # [16, 0], + # [64, 1] + # ] + # ] + # ] + # ], another + # + # + ############################################################################## + def enRule(self, currentIndexIndex, currentIndexRange): + cii = currentIndexIndex + if currentIndexIndex == 0: + self.tab[cii] = "[] " + elif currentIndexIndex == 1: + self.tab[cii] = "[%2u] " % ( \ + currentIndexRange[self.indexOrder[0]][0]) + elif currentIndexIndex == 2: + self.tab[cii] = "[%2u,%2u] " % ( \ + currentIndexRange[self.indexOrder[0]][0], \ + currentIndexRange[self.indexOrder[1]][0]) + elif currentIndexIndex == 3: + self.tab[cii] = "[%2u,%2u,%2u] " % ( \ + currentIndexRange[self.indexOrder[0]][0], \ + currentIndexRange[self.indexOrder[1]][0], \ + currentIndexRange[self.indexOrder[2]][0]) + elif currentIndexIndex == 4: + self.tab[cii] = "[%2u,%2u,%2u,%2u] " % ( \ + currentIndexRange[self.indexOrder[0]][0], \ + currentIndexRange[self.indexOrder[1]][0], \ + currentIndexRange[self.indexOrder[2]][0], \ + currentIndexRange[self.indexOrder[3]][0]) + tab = self.tab[cii] + if globalParameters["PrintLevel"] == 1: + stdout.write("\n%s"%tab) + currentIndex = self.indexOrder[currentIndexIndex] + print2("%senRule(%s)" % (tab, currentIndexRange)) + nextIndexIndex = currentIndexIndex+1 + nextIndexRange = deepcopy(currentIndexRange) + isLastIndex = 
currentIndexIndex == self.numIndices-1 + ruleList = [] + + ######################################## + # if there's only 1 problem size here + ######################################## + if currentIndexRange[currentIndex][1] \ + - currentIndexRange[currentIndex][0] == 1: + + ######################################## + # this is last index, so just return fastest solution + if isLastIndex: + # TODO optimize b/c this should be only single problem + #scores = self.scoreRangeForSolutions(currentIndexRange) + #winnerIdx = 0 + #for solutionIdx in range(1, self.numSolution): + # if scores[solutionIdx] < scores[winnerIdx]: + # winnerIdx = solutionIdx + winnerIdx = self.winnerForRange(currentIndexRange) + #print2("%sreturning early winner=%u" % (tab, winnerIdx)) + ruleList.append(-1) + ruleList.append(winnerIdx) + if globalParameters["PrintLevel"] == 1: + stdout.write("#") + + ######################################## + # this isn't last index, so just recursively return next index + else: + #print2("%sreturning early enRule(%s)" \ + # % (tab, nextIndexRange) ) + rule = [ -1, self.enRule(nextIndexIndex, nextIndexRange) ] + ruleList.append(rule) + if globalParameters["PrintLevel"] == 1: + stdout.write("#") + + ######################################## + # full iterative rule list + ######################################## + else: + + ######################################## + # create initial rule + initialSize = min(currentIndexRange[currentIndex][0] \ + + self.parameters["InitialSolutionWindow"], \ + self.numProblemSizes[currentIndex]) + nextIndexRange[currentIndex][1] = initialSize + if isLastIndex: + winnerIdx = self.winnerForRange(nextIndexRange) + initialRule = [ currentIndexRange[currentIndex][0], winnerIdx] + else: + #print2("%sinitialRule(%s)" % (tab, nextIndexRange)) + initialRule = [ currentIndexRange[currentIndex][0], \ + self.enRule(nextIndexIndex, nextIndexRange) ] + #print2("%sinitialRule(%s) DONE" % (tab, nextIndexRange)) + ruleList.append(initialRule) + if 
globalParameters["PrintLevel"] == 1: + stdout.write("#") + + ######################################## + # for all problem indices in this index + for problemIndex in range(currentIndexRange[currentIndex][0]+1, \ + currentIndexRange[currentIndex][1]): + nextIndexRange[currentIndex][0] = problemIndex + nextIndexRange[currentIndex][1] = problemIndex+1 + priorRule = ruleList[len(ruleList)-1] + priorRuleForSize = deepcopy(priorRule) + priorRuleForSize[0] = problemIndex + + if isLastIndex: + winnerIdx = self.winnerForRange(nextIndexRange) + candidateRule = [ problemIndex, winnerIdx] + else: + candidateRule = [ problemIndex, self.enRule(nextIndexIndex, \ + nextIndexRange) ] + + ######################################## + # candidate same as prior + if candidateRule[1] == priorRule[1]: + #print2("%sP[%2u]: same" % (tab, problemIndex)) + ruleList[len(ruleList)-1][0] = problemIndex + if globalParameters["PrintLevel"] == 1: + stdout.write(" ") + continue + + ######################################## + # compare candidate vs prior + else: + #print2("%sScoring P:%s for Prior=%s, Cand=%s" \ + # % ( tab, nextIndexRange, priorRuleForSize, candidateRule)) + # score prior + priorRuleScore = self.scoreRangeForLogic(nextIndexRange, \ + [priorRuleForSize]) + logicComplexity = [0]*self.numIndices + self.scoreLogicComplexity( \ + [priorRuleForSize], logicComplexity) + priorRuleScore += self.parameters["BranchPenalty"] \ + * sum(logicComplexity) + # score candidate + candidateRuleScore = self.scoreRangeForLogic(nextIndexRange, \ + [candidateRule]) + logicComplexity = [0]*self.numIndices + self.scoreLogicComplexity( \ + [candidateRule], logicComplexity) + candidateRuleScore += self.parameters["BranchPenalty"] \ + * sum(logicComplexity) + candidateRuleScore += self.parameters["BranchPenalty"] # penalize + candidateFaster = candidateRuleScore < priorRuleScore + print2("%sP[%2u]: %s %s~%.0fus < %s~%.0fus" % (tab, problemIndex, \ + "wins" if candidateFaster else "same", \ + candidateRule, 
candidateRuleScore, priorRuleForSize, \ + priorRuleScore )) + + ######################################## + # candidate wins + if candidateRuleScore < priorRuleScore: + ruleList.append(candidateRule) + if globalParameters["PrintLevel"] == 1: + stdout.write("#") + + ######################################## + # prior wins + else: + if globalParameters["PrintLevel"] == 1: + stdout.write(".") + ruleList[len(ruleList)-1][0] = problemIndex + + #print2("%sReturning RuleList: %s" % (tab, ruleList)) + return ruleList + + + + ############################################################################## + ############################################################################## + ### + ### Mid-Level Functions + ### + ############################################################################## + ############################################################################## + + + + ############################################################################## + # Prepare Logic + # convert threshold indices to sizes + # last threshold = -1 + ############################################################################## + def prepareLogic(self, logic): + depth = self.getLogicDepth(logic) + if depth == 0: return + indexIndex = self.numIndices - depth + index = self.indexOrder[indexIndex] + for i in range(0, len(logic)): + if i == len(logic)-1: + logic[i][0] = -1 + else: + logic[i][0] = self.problemIndexToSize[index][logic[i][0]] + self.prepareLogic(logic[i][1]) + + + ############################################################################## + # Print2D + ############################################################################## + def print2D(self, indices ): + indicesIdx = 0 + problemIndices = [] + for i in range(0, self.numIndices): + if i == self.idx0: + problemIndices.append(-1) + elif i == self.idx1: + problemIndices.append(-1) + else: + problemIndices.append(indices[indicesIdx]) + indicesIdx += 1 + + winnerIndices = [] + w = "winner" + g = "gflops" + f = 
"faster" + s = "second" + sss = [] + for sIdx in range(0, self.numSolutions): + sss.append("Sol[%u]" % sIdx) + for j in range(0, self.numProblemSizes[1]): + w += ",%4u" % self.problemIndexToSize[1][j] + g += ",%4u" % self.problemIndexToSize[1][j] + f += ",%4u" % self.problemIndexToSize[1][j] + s += ",%4u" % self.problemIndexToSize[1][j] + for sIdx in range(0, self.numSolutions): + sss[sIdx] += ",%4u" % self.problemIndexToSize[1][j] + w += "\n" + g += "\n" + f += "\n" + s += "\n" + for sIdx in range(0, self.numSolutions): + sss[sIdx] += "\n" + for i in range(0, self.numProblemSizes[0]): + problemIndices[self.idx0] = i + w += "%4u" % self.problemIndexToSize[0][i] + g += "%4u" % self.problemIndexToSize[0][i] + f += "%4u" % self.problemIndexToSize[0][i] + s += "%4u" % self.problemIndexToSize[0][i] + for sIdx in range(0, self.numSolutions): + sss[sIdx] += "%4u" % self.problemIndexToSize[0][i] + for j in range(0, self.numProblemSizes[1]): + problemIndices[self.idx1] = j + problemSerial = self.indicesToSerial(0, problemIndices) + for sIdx in range(0, self.numSolutions): + sss[sIdx] += ",%f" % self.data[problemSerial+sIdx] + + if self.data[problemSerial+0] > self.data[problemSerial+1]: + winnerIdx = 0 + winnerGFlops = self.data[problemSerial+0] + secondIdx = 1 + secondGFlops = self.data[problemSerial+1] + else: + winnerIdx = 1 + winnerGFlops = self.data[problemSerial+1] + secondIdx = 0 + secondGFlops = self.data[problemSerial+0] + for solutionIdx in range(2, self.numSolutions): + solutionSerialIdx = problemSerial + solutionIdx + solutionGFlops = self.data[solutionSerialIdx] + if solutionGFlops > winnerGFlops: + secondIdx = winnerIdx + secondGFlops = winnerGFlops + winnerIdx = solutionIdx + winnerGFlops = solutionGFlops + + + if winnerIdx not in winnerIndices: + winnerIndices.append(winnerIdx) + w += ",%4u" % winnerIdx + g += ",%f" % winnerGFlops + f += ",%f" % (winnerGFlops/secondGFlops) + s += ",%4u" % (secondIdx) + w += "\n" + g += "\n" + f += "\n" + s += "\n" + for sIdx 
in range(0, self.numSolutions): + sss[sIdx] += "\n" + + w += "\n\n" + g += "\n\n" + f += "\n\n" + s += "\n\n" + for sIdx in range(0, self.numSolutions): + sss[sIdx] += "\n\n" + w += "Winners:\n" + for winnerIdx in winnerIndices: + w += "%4u, %s, %s\n" % (winnerIdx, self.solutionTiles[winnerIdx], self.solutionNames[winnerIdx]) + + printFileName = "Winner2D" + for idx in indices: + printFileName += "_%u" % idx + printFileName += ".csv" + printFile = open(os.path.join(globalParameters["WorkingPath"], printFileName), "w") + printFile.write( w ) + printFile.write( g ) + printFile.write( f ) + printFile.write( s ) + for sIdx in range(0, self.numSolutions): + printFile.write( sss[sIdx] ) + printFile.close() + + + ############################################################################## + # Least Important Solution + ############################################################################## + def leastImportantSolution(self): + solutionImportance = [] for i in range(0, self.numSolutions): - #print "sol[%u] = %u wins @ %.0f GFlops" \ - # % (i, solutionNumWins[i], solutionGFlops[i]) - if solutionNumWins[i] > largestWinnerNumWins: - largestWinnerIdx = i - largestWinnerNumWins = solutionNumWins[i] - largestWinnerGFlops = solutionGFlops[i] - #print "Winner at Largest Problem: S[%u] @ %.0f GFlops with %u/%u wins" \ - # % (largestWinnerIdx, largestWinnerGFlops, largestWinnerNumWins, \ - # dilation*2) - problemIndices[self.idx0] = numProblems0-1 - problemIndices[self.idx1] = numProblems0-1 - largestWinnerAtLargestProblemIdx = self.indicesToSerial(largestWinnerIdx, \ - problemIndices) - largestWinnerGFlopsAtLargestSize = \ - self.data[largestWinnerAtLargestProblemIdx] - - ############################################################################ - # Diagonal Rule - # solutionIdx, minSizeThresholdIdx, gflops at minSize, maxGFlops, oldGFlops - numRules = 1 - diagonalRules = [ [largestWinnerIdx, deepcopy(problemIndices), \ - largestWinnerGFlopsAtLargestSize, 
largestWinnerGFlops, -1] ] - - ############################################################################ - # For largest to smallest, determine fastest solution - for problemSizeIdx in range(numProblems0-2, -1, -1): - problemIndices[self.idx0] = problemSizeIdx - problemIndices[self.idx1] = problemSizeIdx - problemIdx = self.indicesToSerial(0, problemIndices) - - # current rule winner performance at this problemSizeIdx - ruleWinnerIdx = diagonalRules[-1][0] - ruleWinnerGFlopsForSize = self.data[problemIdx + ruleWinnerIdx] - - #determine fastest at this problemSizeIdx - (winnerForSizeIdx, winnerForSizeGFlops) = \ - self.getWinnerForProblem( problemIndices ) - - # ruleWinner also wins at this problem size (at least by threshold) - if winnerForSizeIdx == ruleWinnerIdx \ - or ruleWinnerGFlopsForSize > (1-threshold)*winnerForSizeGFlops: - # just update rule - diagonalRules[numRules-1][1] = deepcopy(problemIndices) - diagonalRules[numRules-1][2] = ruleWinnerGFlopsForSize - diagonalRules[numRules-1][3] = max(diagonalRules[numRules-1][3], \ - ruleWinnerGFlopsForSize) - - # we have a new candidate winner - # only keep it if don't revert back to ruleWinner over next Dilation + solutionImportance.append([i, 0, 0, 0]) + problemSizes = [0]*self.numIndices + totalSavedMs = 0 + totalExecMs = 0 + totalWins = 0 + for problemIndices in self.problemIndicesForGlobalRange: + for i in range(0, self.numIndices): + problemSizes[i] = self.problemIndexToSize[i][problemIndices[i]] + totalFlops = self.flopsPerMac + for size in problemSizes: + totalFlops *= size + + problemSerial = self.indicesToSerial(0, problemIndices) + if self.data[problemSerial+0] > self.data[problemSerial+1]: + winnerIdx = 0 + winnerGFlops = self.data[problemSerial+0] + secondIdx = 1 + secondGFlops = self.data[problemSerial+1] else: + winnerIdx = 1 + winnerGFlops = self.data[problemSerial+1] + secondIdx = 0 + secondGFlops = self.data[problemSerial+0] + + for solutionIdx in range(2, self.numSolutions): + 
solutionSerialIdx = problemSerial + solutionIdx + solutionGFlops = self.data[solutionSerialIdx] + if solutionGFlops > winnerGFlops: + secondIdx = winnerIdx + secondGFlops = winnerGFlops + winnerIdx = solutionIdx + winnerGFlops = solutionGFlops + winnerTimeMs = totalFlops / winnerGFlops / 1000000 + secondTimeMs = totalFlops / secondGFlops / 1000000 + solutionImportance[winnerIdx][1] += (secondTimeMs - winnerTimeMs) + solutionImportance[winnerIdx][2] += 1 + solutionImportance[winnerIdx][3] += winnerTimeMs + + totalSavedMs += secondTimeMs - winnerTimeMs + totalExecMs += winnerTimeMs + totalWins += 1 + solutionImportance.sort(key=lambda x: x[1]) + return ( solutionImportance[0][0], \ + solutionImportance[0][1] / totalSavedMs, \ + solutionImportance[0][2] / totalWins, \ + solutionImportance[0][3] / totalExecMs ) - # check if we don't revert back to ruleWinner over next Dilation probs - revert = False - endDilationIdx = max(-1, problemSizeIdx-dilation) - for dilationSizeIdx in range(problemSizeIdx-1, \ - endDilationIdx, -1): - problemIndices[self.idx0] = dilationSizeIdx - problemIndices[self.idx1] = dilationSizeIdx - dilationIdx = self.indicesToSerial(0, problemIndices) - ruleWinnerGFlopsForDilation = self.data[dilationIdx \ - + ruleWinnerIdx] - #determine fastest at this problemSizeIdx - (winnerForDilationIdx, winnerForDilationGFlops) = \ - self.getWinnerForProblem(problemIndices) - - # ruleWinner also wins at dilation size (at least by threshold) - if winnerForDilationIdx == ruleWinnerIdx \ - or ruleWinnerGFlopsForDilation \ - > (1-threshold)*winnerForSizeGFlops: - # yes, within Dilation, we've returned to same winner - revert = True - # so update rule for this size - diagonalRules[numRules-1][1] = deepcopy(problemIndices) - diagonalRules[numRules-1][2] = winnerForDilationGFlops - diagonalRules[numRules-1][3] = max(diagonalRules[numRules-1][3], \ - winnerForSizeGFlops) - # resume outer loop after dilation - problemSizeIdx = dilationSizeIdx + + 
############################################################################## + # Remove Solution + ############################################################################## + def removeSolution(self, removeSolutionIdx): + + # temporarily move current to old + oldSolutions = self.solutions + oldNumSolutions = self.numSolutions + oldData = self.data + oldTotalSize = self.totalSize + + # update solutions + self.solutions = [] + for i in range(0, oldNumSolutions): + if i != removeSolutionIdx: + self.solutions.append(oldSolutions[i]) + self.solutionMinNaming = Solution.getMinNaming(self.solutions) + self.solutionNames = [] + self.solutionTiles = [] + for solution in self.solutions: + self.solutionNames.append(Solution.getNameMin(solution, \ + self.solutionMinNaming)) + self.solutionTiles.append("%ux%u"%(solution["MacroTile0"], \ + solution["MacroTile1"])) + self.numSolutions = len(self.solutions) + + # update data + self.totalSize = self.totalProblems * self.numSolutions + self.data = array.array('f', [0]*self.totalSize) + for problemIndex in range(0, self.totalProblems): + newSolutionIdx = 0 + for oldSolutionIdx in range(0, oldNumSolutions): + if oldSolutionIdx != removeSolutionIdx: + self.data[problemIndex*self.numSolutions+newSolutionIdx] \ + = oldData[problemIndex*oldNumSolutions+oldSolutionIdx] + newSolutionIdx += 1 + + + ############################################################################## + # Score Range For Logic + ############################################################################## + def scoreRangeForLogic(self, indexRange, logic): + depth = self.getLogicDepth(logic) + depth = self.numIndices - depth + fullLogic = deepcopy(logic) + for i in range(0, depth): + fullLogic = [[-1, fullLogic]] + fullLogic = fullLogic + return self.scoreRangeForFullLogic(depth, indexRange, fullLogic) + + ############################################################################## + # Score Range For Full Logic + 
############################################################################## + def scoreRangeForFullLogic(self, depth, indexRange, logic): + score = 0 + for problemIndices in self.problemIndicesForRange(indexRange): + problemSerial = self.indicesToSerial(0, problemIndices) + totalFlops = self.totalFlopsForProblemIndices(problemIndices) + solutionIdx = self.getSolutionForProblemIndicesUsingLogic( \ + problemIndices, logic) + gflops = self.data[problemSerial + solutionIdx] + timeUs = totalFlops / gflops / 1000 + score += timeUs + return score + + ############################################################################## + # Get Solution For Problem Indices Using Logic + ############################################################################## + def getSolutionForProblemIndicesUsingLogic(self, problemIndices, logic): + currentProblemIndices = self.toIndexOrder(problemIndices) + currentLogic = logic + for i in range(0, self.numIndices): + currentSizeIndex = currentProblemIndices[0] + for j in range(0, len(currentLogic)): + if currentLogic[j][0] < 0: + currentProblemIndices = currentProblemIndices[1:] + currentLogic = currentLogic[j][1] + break + if currentLogic[j][0] >= 0: + if currentSizeIndex <= currentLogic[j][0]: + currentProblemIndices = currentProblemIndices[1:] + currentLogic = currentLogic[j][1] break - else: - # different winner at this dilation size - # don't need to do anything - pass - - # if we never revert to rule during dilation, create new rule - if not revert: - # solutionIdx, minSizeThresholdIdx, gflops at minSize, maxGFlops, old - newRule = [ winnerForSizeIdx, deepcopy(problemIndices), \ - winnerForSizeGFlops, winnerForSizeGFlops, ruleWinnerGFlopsForSize] - diagonalRules.append(newRule) - numRules += 1 - #print "Added new rule: %s" % newRule - - return diagonalRules - #end diagonal rules - - - ############################################################################## - # Skinny Solutions - 
############################################################################## - def getSkinnySolutions(self, diagonalRules, problemIndices, \ - idxLarge, idxSmall): - idx0 = self.idx0 - idx1 = self.idx1 - #idxU = self.idxU - #dilation = self.analysisParameters["Dilation"] - threshold = self.analysisParameters["Threshold"] - - skinnyRules = [] - - # for each size threshold along diagonal - for diagonalRuleIdx in range(0, len(diagonalRules)): - diagonalRule = diagonalRules[diagonalRuleIdx] - diagonalRuleWinnerIdx = diagonalRule[0] - diagonalRuleThresholdProblem = diagonalRule[1] - #diagonalRuleGFlops = diagonalRule[2] # perf at threshold - thresholdSizeFree = self.getSizeFree(diagonalRuleThresholdProblem) - print2("ThresholdSizeFree[%u][%u]: %u" \ - % (diagonalRuleThresholdProblem[idx0], \ - diagonalRuleThresholdProblem[idx1], \ - thresholdSizeFree)) - - # check skinny d0< thresholdSizeFree: - #print "SkinnySizeFree[%u][%u]: %u" % (sizeIdxSmall, sizeIdxLarge, \ - # skinnySizeFree) - - # rule winner's performance at this skinnyness - skinnyProblemIdx = self.indicesToSerial(0, skinnyProblemIndices) - diagonalWinnerGFlopsForSkinny = self.data[skinnyProblemIdx \ - + diagonalRuleWinnerIdx] - - # which solution wins here? 
- (winnerIdx, winnerGFlops) = \ - self.getWinnerForProblem(skinnyProblemIndices) - #print winnerIdx, winnerGFlops - if winnerIdx == diagonalRuleWinnerIdx \ - or diagonalWinnerGFlopsForSkinny > (1-threshold)*winnerGFlops: - # diagonal rule also wins here - print2("if dS <%5u and dL >%5u diagnl S[%2u] %5.0f == S[%2u] %5.0f GFlops" \ - % (self.problemIndexToSize[idxSmall][sizeIdxSmall], \ - self.problemIndexToSize[idxLarge][sizeIdxLarge], \ - winnerIdx, winnerGFlops, diagonalRuleWinnerIdx, \ - diagonalWinnerGFlopsForSkinny )) - pass - else: - # we're so skinny that diagonal rule no longer applies - print2("if dS <%5u and dL >%5u skinny S[%2u] %5.0f >> S[%2u] %5.0f GFlops" \ - % (self.problemIndexToSize[idxSmall][sizeIdxSmall], \ - self.problemIndexToSize[idxLarge][sizeIdxLarge], \ - winnerIdx, winnerGFlops, diagonalRuleWinnerIdx, \ - diagonalWinnerGFlopsForSkinny )) - skinnyRule = [deepcopy(skinnyProblemIndices), winnerIdx, \ - winnerGFlops] - skinnyRules.append(skinnyRule) - # TODO need to use dilate parameter to make sure we've switched - # TODO data along this size may not agree with - # data along different sizes (but perhaps it should - # TODO need extra loop here, to iterate idxSmall to - # smaller sizes to see if the solution changes further - - # does the diagonalRuleWinner also win here? 
- break # only check the problem size closest to ruleSize - - return skinnyRules - # end skinny solutions - - ############################################################################## - # Get Size Free and Summation + return currentLogic + + + ############################################################################## + ############################################################################## + ### + ### Helper / Low-Level Functions + ### + ############################################################################## + ############################################################################## + + ############################################################################## + # Get Winner For Problem def getWinnerForProblem(self, problemIndices): - problemIdx = self.indicesToSerial(0, problemIndices) + problemSerial = self.indicesToSerial(0, problemIndices) winnerIdx = -1 winnerGFlops = -1 for solutionIdx in range(0, self.numSolutions): - solutionSerialIdx = problemIdx + solutionIdx + solutionSerialIdx = problemSerial + solutionIdx solutionGFlops = self.data[solutionSerialIdx] if solutionGFlops > winnerGFlops: - #print "%f > %f" % (solutionGFlops, winnerGFlops) winnerIdx = solutionIdx winnerGFlops = solutionGFlops return (winnerIdx, winnerGFlops) ############################################################################## - # Get Size Free and Summation + # Winner For Range + def winnerForRange(self, indexRange): + scores = self.scoreRangeForSolutions(indexRange) + winnerIdx = 0 + for solutionIdx in range(1, self.numSolutions): + if scores[solutionIdx] < scores[winnerIdx]: + winnerIdx = solutionIdx + return winnerIdx + + ############################################################################## + # Score (microseconds) Range For Solutions + def scoreRangeForSolutions(self, indexRange): + scores = [0]*self.numSolutions + for problemIndices in self.problemIndicesForRange(indexRange): + problemSerial = self.indicesToSerial(0, problemIndices) + 
totalFlops = self.totalFlopsForProblemIndices(problemIndices) + for solutionIdx in range(0, self.numSolutions): + gflops = self.data[problemSerial+solutionIdx] + timeUs = totalFlops / gflops / 1000 + scores[solutionIdx] += timeUs + return scores + + + ############################################################################## + # Score Logic Complexity + def scoreLogicComplexity(self, logic, logicComplexity): + depth = self.getLogicDepth(logic) + if depth == 0: return + depth = self.numIndices - depth + currentLogic = logic + for i in range(0, len(logic)): + logicComplexity[depth] += 1 + self.scoreLogicComplexity(logic[i][1], logicComplexity) + + + ############################################################################## + # Get Logic Depth + def getLogicDepth(self, logic): + obj = logic + depth = 0 + while isinstance(obj, list): + obj = obj[0][1] + depth += 1 + return depth + + + ############################################################################## + # To Index Order + def toIndexOrder(self, problemIndices): + ordered = [] + for i in self.indexOrder: + ordered.append(problemIndices[i]) + return ordered + + + ############################################################################## + # Total Flops For Problem Indices + def totalFlopsForProblemIndices(self, problemIndices): + totalFlops = self.flopsPerMac + for i in range(0, self.numIndices): + totalFlops *= self.problemIndexToSize[i][problemIndices[i]] + return totalFlops + + + ############################################################################## + # Recommended Index Order + # TODO, this may depend on transposes + def recommendedIndexOrder(self): + order = [] + for i in range(0, self.numIndices): + if i != self.idxU and i != self.idx1 and i != self.idx0: + order.append(i) + order.append(self.idxU) + order.append(self.idx0) + order.append(self.idx1) + return order + + ############################################################################## + # Problem Indices For Range + def 
problemIndicesForRange(self, indexRange): + problemIndexList = [] + problemIndices = [] + for idx in indexRange: + problemIndices.append(idx[0]) + moreProblems = True + while moreProblems: + problemIndexList.append(deepcopy(problemIndices)) + # next problem + problemIndices[0] += 1 + for i in range(0, self.numIndices): + if problemIndices[i] >= indexRange[i][1]: + if i == self.numIndices-1: + moreProblems = False + break + else: + problemIndices[i] = indexRange[i][0] + problemIndices[i+1] += 1 + else: + break + return problemIndexList + + + ############################################################################## + # Get Size Free def getSizeFree(self, problemIndices): sizeFree = 1 - for i in range(0, self.problemType["NumIndicesC"]): + for i in self.rangeIndicesFree: sizeFree *= self.problemIndexToSize[i][problemIndices[i]] return sizeFree + + ############################################################################## + # Get Size Summation def getSizeSummation(self, problemIndices): sizeSummation = 1 - for i in range(self.problemType["NumIndicesC"], \ - self.problemType["TotalIndices"]): + for i in self.rangeIndicesSummation: sizeSummation *= self.problemIndexToSize[i][problemIndices[i]] return sizeSummation - ############################################################################## - # Print Data - ############################################################################## - def printData(self): - print2("serial; idxD0, idxD1, idxDU, idxOthers; sizeD0, sizeD1, sizeDU, sizeOthers; sol0, sol1, sol2, ...") - indices = [0]*self.numIndices - for serial in range(0, self.totalProblems): - s = "[%4u] [%2u" % (serial, indices[0]) - for i in range(1, self.numIndices): - s += ", %2u" % indices[i] - s += "] [%4u" % self.problemIndexToSize[0][indices[0]] - for i in range(1, self.numIndices): - s += ", %4u" % self.problemIndexToSize[i][indices[i]] - s += "]: %9.3f" % self.data[serial*self.numSolutions+0] - for i in range(1, self.numSolutions): - s += ", %9.3f" 
% self.data[serial*self.numSolutions+i] - print2(s) - indices[0] += 1 - for i in range(1, self.numIndices): - if indices[i-1] >= self.numProblemSizes[i-1]: - indices[i-1] = 0 - indices[i] += 1 ############################################################################## # Get Item - ############################################################################## def __getitem__(self, indexTuple): indices = indexTuple[0] # in analysis order solutionIdx = indexTuple[1] serial = self.indicesToSerial(solutionIdx, indices) return self.data[serial] + ############################################################################## - # Get Item - ############################################################################## + # Set Item def __setitem__(self, indexTuple, value): indices = indexTuple[0] # in analysis order solutionIdx = indexTuple[1] serial = self.indicesToSerial(solutionIdx, indices ) self.data[serial] = value + ############################################################################## # Indices -> Serial - ############################################################################## def indicesToSerial(self, solutionIdx, indices ): serial = 0 stride = 1 @@ -588,8 +929,13 @@ def indicesToSerial(self, solutionIdx, indices ): return serial + +################################################################################ +################################################################################ +### +### Main +### ################################################################################ -# Main ################################################################################ def main( config ): print2("# LibraryLogic config: %s" % config) @@ -643,8 +989,17 @@ def main( config ): # Run Analysis schedulePrefix = globalParameters["Name"] for problemTypeTuple in problemTypeTuples: - logic = analyzeProblemType( problemTypeTuple, analysisParameters ) + logicTuple = analyzeProblemType( problemTypeTuple, analysisParameters ) 
YAMLIO.writeLibraryLogicForProblemType(globalParameters["WorkingPath"], \ - schedulePrefix, logic) + schedulePrefix, logicTuple) popWorkingPath() + +######################################## +# TODO +# - different weights for different levels? +# are there pairs of weights that would result in same logic complexity but better score? + +######################################## +# TODO problems which this algorithm +# - barrier to switching may not always be amortised on next step, need to calculate several steps into future to see if net win; process needs to be a search tree. 32x32 search only takes 1 second diff --git a/Tensile/SolutionStructs.py b/Tensile/SolutionStructs.py index b21f98728..e3d66be05 100644 --- a/Tensile/SolutionStructs.py +++ b/Tensile/SolutionStructs.py @@ -20,7 +20,7 @@ ################################################################################ -from Common import globalParameters, defaultProblemType, assignParameterWithDefault, printExit, assignParameterRequired, defaultSolution, derrivedParameters +from Common import globalParameters, defaultProblemType, assignParameterWithDefault, printExit, assignParameterRequired, defaultSolution, derivedParameters from copy import deepcopy ################################################################################ @@ -95,9 +95,9 @@ def zeroString(self, backend): zeroString = "(%s)(" % self.toDevice(backend) if self.value == self.single or self.value == self.half: zeroString += "0.f" - elif self.value == self.double: + elif self.value == self.double: zeroString += "0.0" - elif self.value == self.complexSingle: + elif self.value == self.complexSingle: zeroString += "0.f, 0.f" elif self.value == self.complexDouble: zeroString += "0.0, 0.0" @@ -123,6 +123,8 @@ def numRegisters( self ): return self.properties[self.value][self.idxReg] def numBytes( self ): return self.numRegisters() * 4 + def flopsPerMac(self): + return 2 if self.isReal() else 8 def __str__(self): return self.toChar() @@ -214,7 +216,8 
@@ def __init__(self, config): elif self["OperationType"] == "TensorContraction": self.initTensorContraction(config) - self.assignIndices() + self.state["AssignedDerivedParameters"] = False + ProblemType.assignDerivedParameters(self.state) ######################################## @@ -249,90 +252,98 @@ def isTensorContraction(self): ######################################## # determine d0, d1, dU - def assignIndices(self): - self["TotalIndices"] = max(max(self["IndexAssignmentsA"])+1, max(self["IndexAssignmentsB"])+1) + @staticmethod + def assignDerivedParameters(state): + if "AssignedDerivedParameters" in state: + if state["AssignedDerivedParameters"]: + return + state["AssignedDerivedParameters"] = False + + state["TotalIndices"] = max(max(state["IndexAssignmentsA"])+1, \ + max(state["IndexAssignmentsB"])+1) # determine num free, batch - self["IndicesFree"] = [] - self["IndicesBatch"] = [] - self["IndicesSummation"] = [] + state["IndicesFree"] = [] + state["IndicesBatch"] = [] + state["IndicesSummation"] = [] - for i in range(0, self["NumIndicesC"]): - inA = i in self["IndexAssignmentsA"] - inB = i in self["IndexAssignmentsB"] + for i in range(0, state["NumIndicesC"]): + inA = i in state["IndexAssignmentsA"] + inB = i in state["IndexAssignmentsB"] if inA and inB: - #self["NumIndicesBatch"] = (i+1)-self["NumIndicesFree"] - self["IndicesBatch"].append(i) + #state["NumIndicesBatch"] = (i+1)-state["NumIndicesFree"] + state["IndicesBatch"].append(i) elif inA or inB: - #self["NumIndicesFree"] = (i+1) - self["IndicesFree"].append(i) + #state["NumIndicesFree"] = (i+1) + state["IndicesFree"].append(i) else: printExit("invalid index %u" % i) # determine num summation - for i in range(self["NumIndicesC"], self["TotalIndices"]): - inA = i in self["IndexAssignmentsA"] - inB = i in self["IndexAssignmentsB"] + for i in range(state["NumIndicesC"], state["TotalIndices"]): + inA = i in state["IndexAssignmentsA"] + inB = i in state["IndexAssignmentsB"] if inA and inB: - 
#self["NumIndicesSummation"] = (i+1)-self["NumIndicesC"] - self.state["IndicesSummation"].append(i) + #state["NumIndicesSummation"] = (i+1)-state["NumIndicesC"] + state["IndicesSummation"].append(i) else: printExit("invalid index %u" % i) - self["NumIndicesFree"] = len(self["IndicesFree"]) - self["NumIndicesBatch"] = len(self["IndicesBatch"]) - self["NumIndicesSummation"] = len(self["IndicesSummation"]) + state["NumIndicesFree"] = len(state["IndicesFree"]) + state["NumIndicesBatch"] = len(state["IndicesBatch"]) + state["NumIndicesSummation"] = len(state["IndicesSummation"]) # by default, unroll index will be the first summation index # TODO sort summation indices by "stride" - self["IndexUnroll"] = self["IndicesSummation"][0] - for i in range(0, len(self["IndexAssignmentsA"])): - if self["IndexAssignmentsA"][i] == self["IndexUnroll"]: - self["IndexUnrollA"] = i + state["IndexUnroll"] = state["IndicesSummation"][0] + for i in range(0, len(state["IndexAssignmentsA"])): + if state["IndexAssignmentsA"][i] == state["IndexUnroll"]: + state["IndexUnrollA"] = i break - for i in range(0, len(self["IndexAssignmentsB"])): - if self["IndexAssignmentsB"][i] == self["IndexUnroll"]: - self["IndexUnrollB"] = i + for i in range(0, len(state["IndexAssignmentsB"])): + if state["IndexAssignmentsB"][i] == state["IndexUnroll"]: + state["IndexUnrollB"] = i break # assign d0, d1 - self["Index01A"] = -1 - self["Index01B"] = -1 - for i in self["IndexAssignmentsA"]: - if i < self["NumIndicesC"]: - self["Index01A"] = i + state["Index01A"] = -1 + state["Index01B"] = -1 + for i in state["IndexAssignmentsA"]: + if i < state["NumIndicesC"]: + state["Index01A"] = i break - for i in self["IndexAssignmentsB"]: - if i < self["NumIndicesC"]: - self["Index01B"] = i + for i in state["IndexAssignmentsB"]: + if i < state["NumIndicesC"]: + state["Index01B"] = i break # whichever has lower stride in C (lower value), is 0, other is 1 - if self["Index01A"] < self["Index01B"]: - self["Index0"] = 
self["Index01A"] - self["Index1"] = self["Index01B"] - self["Tensor0"] = 0 - self["Tensor1"] = 1 - self["TileA"] = 0 - self["TileB"] = 1 + if state["Index01A"] < state["Index01B"]: + state["Index0"] = state["Index01A"] + state["Index1"] = state["Index01B"] + state["Tensor0"] = 0 + state["Tensor1"] = 1 + state["TileA"] = 0 + state["TileB"] = 1 else: - self["Index0"] = self["Index01B"] - self["Index1"] = self["Index01A"] - self["Tensor0"] = 1 - self["Tensor1"] = 0 - self["TileA"] = 1 - self["TileB"] = 0 + state["Index0"] = state["Index01B"] + state["Index1"] = state["Index01A"] + state["Tensor0"] = 1 + state["Tensor1"] = 0 + state["TileA"] = 1 + state["TileB"] = 0 # generalize transpose - strideIdxA = self["IndexAssignmentsA"].index(self["Index01A"]) - strideIdxB = self["IndexAssignmentsB"].index(self["Index01B"]) - unrollIdxA = self["IndexAssignmentsA"].index(self["IndexUnroll"]) - unrollIdxB = self["IndexAssignmentsB"].index(self["IndexUnroll"]) - self["TLUA"] = strideIdxA < unrollIdxA - self["TLUB"] = strideIdxB < unrollIdxB + strideIdxA = state["IndexAssignmentsA"].index(state["Index01A"]) + strideIdxB = state["IndexAssignmentsB"].index(state["Index01B"]) + unrollIdxA = state["IndexAssignmentsA"].index(state["IndexUnroll"]) + unrollIdxB = state["IndexAssignmentsB"].index(state["IndexUnroll"]) + state["TLUA"] = strideIdxA < unrollIdxA + state["TLUB"] = strideIdxB < unrollIdxB #unrollDimStrideGreaterThanTileDimStrideA = TLUA #unrollDimStrideLessThanTileDimStrideB = !TLUB + state["AssignedDerivedParameters"] = True @@ -364,6 +375,18 @@ def __str__(self): if self["UseInitialStrides"]: name += "I" return name + def keys(self): + return self.state.keys() + def __len__(self): + return len(self.state) + def __iter__(self): + return iter(self.state) + + + + + + def __getitem__(self, key): return self.state[key] def __setitem__(self, key, value): @@ -504,8 +527,10 @@ def __init__(self, config): for key in config: if key != "ProblemType" and key not in self.state: 
self.state[key] = config[key] - - Solution.assignDimsFromEdgeAndShape(self.state) + self["Valid"] = True + self["AssignedProblemIndependentDerivedParameters"] = False + self["AssignedDerivedParameters"] = False + Solution.assignDerivedParameters(self.state) ######################################## # get a list of kernel parameters for this solution @@ -529,24 +554,28 @@ def getKernels(self): ######################################## - # assign Dim0, 1 based on edge and shape + # assign tile sizes @staticmethod - def assignDimsFromEdgeAndShape(state): + def assignProblemIndependentDerivedParameters(state): + if "AssignedProblemIndependentDerivedParameters" in state: + if state["AssignedProblemIndependentDerivedParameters"]: + return + state["AssignedProblemIndependentDerivedParameters"] = False # workgroup sizes state["WorkGroup0"] = state["WorkGroupEdge"] state["WorkGroup1"] = state["WorkGroupEdge"] - if state["WorkGroupShape"] == 1: - state["WorkGroup1"] *= 2 - elif state["WorkGroupShape"] == -1: - state["WorkGroup0"] *= 2 + if state["WorkGroupShape"] > 0: + state["WorkGroup1"] *= abs(state["WorkGroupShape"]) + elif state["WorkGroupShape"] < 0: + state["WorkGroup0"] *= abs(state["WorkGroupShape"]) # thread tile sizes state["ThreadTile0"] = state["ThreadTileEdge"] state["ThreadTile1"] = state["ThreadTileEdge"] - if state["ThreadTileShape"] == 1: - state["ThreadTile1"] *= 2 - elif state["ThreadTileShape"] == -1: - state["ThreadTile0"] *= 2 + if state["ThreadTileShape"] > 0: + state["ThreadTile1"] *= abs(state["ThreadTileShape"]) + elif state["ThreadTileShape"] < 0: + state["ThreadTile0"] *= abs(state["ThreadTileShape"]) # macro tile sizes if "WorkGroup0" in state and "ThreadTile0" in state: @@ -556,6 +585,267 @@ def assignDimsFromEdgeAndShape(state): if "SplitU" in state and "LoopUnroll" in state: state["DepthU"] = state["SplitU"] * state["LoopUnroll"] + printReason = False + # num threads + state["NumThreads"] = state["WorkGroup0"]*state["WorkGroup1"] + if 
state["NumThreads"] > globalParameters["MaxThreads"]: + if printReason: print2("rejecting %u threads" % state["NumThreads"]) + state["Valid"] = False + if state["NumThreads"] < globalParameters["MinThreads"]: + if printReason: print2("rejecting %u threads" % state["NumThreads"]) + state["Valid"] = False + + # tile shape + if state["MacroTile0"]/state["MacroTile1"] > globalParameters["MaxMacroTileRatio"] \ + or state["MacroTile1"]/state["MacroTile0"] > globalParameters["MaxMacroTileRatio"]: + state["Valid"] = False + + # done + state["AssignedProblemIndependentDerivedParameters"] = True + + + ######################################## + # assign all derived parameters + @staticmethod + def assignDerivedParameters(state): + Solution.assignProblemIndependentDerivedParameters(state) + if "AssignedDerivedParameters" in state: + if state["AssignedDerivedParameters"]: + return + state["AssignedDerivedParameters"] = False + + ProblemType.assignDerivedParameters(state["ProblemType"]) + printReason = False + + # tile size + if state["ThreadTile0"]*state["ThreadTile1"]*state["ProblemType"]["DataType"].numRegisters() > globalParameters["MaxThreadTile"]: + state["Valid"] = False + + # how many elements to load + if state["ProblemType"]["TLUA"]: + totalElementsCoalescedA = state["MacroTile0"] + totalElementsPerpA = state["LoopUnroll"] + else: + totalElementsCoalescedA = state["LoopUnroll"] + totalElementsPerpA = state["MacroTile0"] + + if state["ProblemType"]["TLUB"]: + totalElementsCoalescedB = state["MacroTile1"] + totalElementsPerpB = state["LoopUnroll"] + else: + totalElementsCoalescedB = state["LoopUnroll"] + totalElementsPerpB = state["MacroTile1"] + totalElementsA = totalElementsCoalescedA * totalElementsPerpA + totalElementsB = totalElementsCoalescedB * totalElementsPerpB + + # how many load instructions + if totalElementsA % state["NumThreads"] != 0: + if printReason: print2("totalElementsA %u %% NumThreads %u != 0" \ + % (totalElementsA, state["NumThreads"])) + 
state["Valid"] = False + return + else: + state["NumLoadsA"] = totalElementsA / state["NumThreads"] + if totalElementsB % state["NumThreads"] != 0: + if printReason: print2("totalElementsB %u %% NumThreads %u != 0" \ + % (totalElementsB, state["NumThreads"])) + state["Valid"] = False + return + state["NumLoadsB"] = totalElementsB / state["NumThreads"] + else: + state["NumLoadsB"] = totalElementsB / state["NumThreads"] + + # nlca = 1 + if state["NumLoadsCoalescedA"] == 1: + foundValid = False + for nlca in range(1, state["NumLoadsA"]+1): + nlpa = state["NumLoadsA"] / nlca + if state["NumLoadsA"] % nlca == 0 \ + and totalElementsCoalescedA % nlca == 0 \ + and totalElementsPerpA % nlpa == 0: + state["NumLoadsCoalescedA"] = nlca + state["NumLoadsPerpendicularA"] = nlpa + foundValid = True + break + if not foundValid: + state["Valid"] = False + return + + # nlca = -1 + elif state["NumLoadsCoalescedA"] == -1: + foundValid = False + for nlca in range(state["NumLoadsA"], 0, -1): + nlpa = state["NumLoadsA"] / nlca + if state["NumLoadsA"] % nlca == 0 \ + and totalElementsCoalescedA % nlca == 0 \ + and totalElementsPerpA % nlpa == 0: + state["NumLoadsCoalescedA"] = nlca + state["NumLoadsPerpendicularA"] = nlpa + foundValid = True + break + if not foundValid: + state["Valid"] = False + return + + # nlca = other + else: + state["NumLoadsPerpendicularA"] = state["NumLoadsA"] \ + / state["NumLoadsCoalescedA"] + + if state["NumLoadsA"] % state["NumLoadsCoalescedA"] != 0: + if printReason: print2("numLoadsA %u %% numLoadsParaA %u != 0" \ + % (state["NumLoadsA"], state["NumLoadsCoalescedA"])) + state["Valid"] = False + if totalElementsCoalescedA % state["NumLoadsCoalescedA"] != 0: + if printReason: print2("totalElementsCoalescedA %u %% numLoadsParaA %u != 0" \ + % (totalElementsCoalescedA, state["NumLoadsCoalescedA"])) + state["Valid"] = False + return + if totalElementsPerpA % state["NumLoadsPerpendicularA"] != 0: + if printReason: print2("totalElementsPerpA %u %% numLoadsPerpA %u 
!= 0" \ + % (totalElementsPerpA, state["NumLoadsPerpendicularA"])) + state["Valid"] = False + return + + + + + + # nlcb = 1 + if state["NumLoadsCoalescedB"] == 1: + foundValid = False + for nlca in range(1, state["NumLoadsB"]+1): + nlpa = state["NumLoadsB"] / nlca + if state["NumLoadsB"] % nlca == 0 \ + and totalElementsCoalescedB % nlca == 0 \ + and totalElementsPerpB % nlpa == 0: + state["NumLoadsCoalescedB"] = nlca + state["NumLoadsPerpendicularB"] = nlpa + foundValid = True + break + if not foundValid: + state["Valid"] = False + return + + # nlcb = -1 + elif state["NumLoadsCoalescedB"] == -1: + foundValid = False + for nlca in range(state["NumLoadsB"], 0, -1): + nlpa = state["NumLoadsB"] / nlca + if state["NumLoadsB"] % nlca == 0 \ + and totalElementsCoalescedB % nlca == 0 \ + and totalElementsPerpB % nlpa == 0: + state["NumLoadsCoalescedB"] = nlca + state["NumLoadsPerpendicularB"] = nlpa + foundValid = True + break + if not foundValid: + state["Valid"] = False + return + + # nlcb = other + else: + state["NumLoadsPerpendicularB"] = state["NumLoadsB"] \ + / state["NumLoadsCoalescedB"] + + if state["NumLoadsB"] % state["NumLoadsCoalescedB"] != 0: + if printReason: print2("numLoadsB %u %% numLoadsParaB %u != 0" \ + % (state["NumLoadsB"], state["NumLoadsCoalescedB"])) + state["Valid"] = False + if totalElementsCoalescedB % state["NumLoadsCoalescedB"] != 0: + if printReason: print2("totalElementsCoalescedB %u %% numLoadsParaB %u != 0" \ + % (totalElementsCoalescedB, state["NumLoadsCoalescedB"])) + state["Valid"] = False + return + if totalElementsPerpB % state["NumLoadsPerpendicularB"] != 0: + if printReason: print2("totalElementsPerpB %u %% numLoadsPerpB %u != 0" \ + % (totalElementsPerpB, state["NumLoadsPerpendicularB"])) + state["Valid"] = False + return + + + + + + + + + """ + if state["NumLoadsCoalescedB"] < 1: + state["NumLoadsCoalescedB"] = state["NumLoadsB"] + if state["NumLoadsB"] % state["NumLoadsCoalescedB"] != 0: + if printReason: print2("numLoadsB %u %% 
numLoadsParaB %u != 0" \ + % (state["NumLoadsB"], state["NumLoadsCoalescedB"])) + state["Valid"] = False + return + else: + state["NumLoadsPerpendicularB"] = state["NumLoadsB"] \ + / state["NumLoadsCoalescedB"] + + + # load size para/perp B + if totalElementsCoalescedB % state["NumLoadsCoalescedB"] != 0: + if printReason: print2("totalElementsCoalescedB %u %% numLoadsParaB %u != 0" \ + % (totalElementsCoalescedB, state["NumLoadsCoalescedB"])) + state["Valid"] = False + return + #else: + # loadSizeParaB = totalElementsCoalescedB / state["NumLoadsCoalescedB"] + if totalElementsPerpB % state["NumLoadsPerpendicularB"] != 0: + if printReason: print2("totalElementsPerpB %u %% numLoadsPerpB %u != 0" \ + % (totalElementsPerpB, state["NumLoadsPerpendicularB"])) + state["Valid"] = False + return + #else: + # loadSizePerpB = totalElementsPerpB / state["NumLoadsPerpendicularB"] + """ + + + + + + + + + + # too much LDS + sizeLDS = state["LoopUnroll"] \ + * (state["PadLDS"] * 2 + state["MacroTile0"] \ + + state["MacroTile1"] ) \ + * state["ProblemType"]["DataType"].numBytes() + if sizeLDS > globalParameters["MaxLDS"]: + if printReason: print2("Kernel Uses %u > %u bytes" % ( sizeLDS, globalParameters["MaxLDS"])) + state["Valid"] = False + return + + # Compiler may be causing incorrect spills on ROCm1.4 from DT on 2/21/17 + if globalParameters["Backend"] == "HIP": + if state["ProblemType"]["DataType"].value == DataType.single: + if state["MacroTile0"] == 128 or state["MacroTile1"] == 128: + if state["NumLoadsCoalescedA"] != 1 and state["NumLoadsCoalescedB"] != 8: + state["Valid"] = False + #return + elif state["ProblemType"]["DataType"].value == DataType.double: + if globalParameters["Backend"] == "HIP": + if state["MacroTile0"] >= 64 or state["MacroTile1"] >= 64: + state["Valid"] = False + #return + state["AssignedDerivedParameters"] = True + + +# validation failures +# 
Cijk_Ailk_Bjlk_SB_DU16_LU16_MT064_MT164_NLA16_NLB16_NLCA02_NLCB01_NLPA08_NLPB16_TT008_TT108_TTE08_WG008_WG108_WGE08 +# Cijk_Ailk_Bjlk_SB_DU16_LU16_MT064_MT164_NLA16_NLB16_NLCA04_NLCB02_NLPA04_NLPB08_TT008_TT108_TTE08_WG008_WG108_WGE08 +# Cijk_Ailk_Bjlk_SB_DU16_LU16_MT064_MT164_NLA16_NLB16_NLCA02_NLCB04_NLPA08_NLPB04_TT008_TT108_TTE08_WG008_WG108_WGE08 + +# Cijk_Ailk_Bjlk_DB_DU16_LU16_MT064_MT164_NLA16_NLB16_NLCA04_NLCB01_NLPA04_NLPB16_TT008_TT108_TTE08_WG008_WG108_WGE08 +# Cijk_Ailk_Bjlk_DB_DU08_LU08_MT064_MT164_NLA08_NLB08_NLCA01_NLCB01_NLPA08_NLPB08_TT008_TT108_TTE08_WG008_WG108_WGE08 +# Cijk_Ailk_Bjlk_DB_DU08_LU08_MT064_MT164_NLA08_NLB08_NLCA08_NLCB01_NLPA01_NLPB08_TT008_TT108_TTE08_WG008_WG108_WGE08 +# Cijk_Ailk_Bjlk_DB_DU08_LU08_MT064_MT164_NLA08_NLB08_NLCA08_NLCB08_NLPA01_NLPB01_TT008_TT108_TTE08_WG008_WG108_WGE08 +# Cijk_Ailk_Bjlk_DB_DU16_LU16_MT064_MT164_NLA16_NLB16_NLCA08_NLCB08_NLPA02_NLPB02_TT008_TT108_TTE08_WG008_WG108_WGE08 +# Cijk_Ailk_Bjlk_DB_DU08_LU08_MT064_MT164_NLA08_NLB08_NLCA01_NLCB08_NLPA08_NLPB01_TT008_TT108_TTE08_WG008_WG108_WGE08 + + ######################################## # create a dictionary with booleans on whether to include parameter in name @@ -573,14 +863,16 @@ def getMinNaming(objs): # only 1, rather than name being nothing, it'll be everything if len(objs) == 1: for key in keys: - requiredParameters[key] = False + if key not in derivedParameters: + requiredParameters[key] = False else: for key in keys: required = False - for i in range(1, len(objs)): - if objs[0][key] != objs[i][key]: - required = True - break + if key not in derivedParameters: + for i in range(1, len(objs)): + if objs[0][key] != objs[i][key]: + required = True + break if required: requiredParameters[key] = True else: @@ -617,8 +909,6 @@ def getNameMin(state, requiredParameters): first = False name += "%s%s" % ( Solution.getParameterNameAbbreviation(key), \ Solution.getParameterValueAbbreviation(state[key]) ) - #else: - # print "%s not in %s" % (key, 
requiredParameters) return name ######################################## @@ -627,26 +917,20 @@ def getNameMin(state, requiredParameters): def getSerialNaming(objs): data = {} for objIdx in range(0, len(objs)): - #print "ObjIdx: %u" % objIdx obj = objs[objIdx] for paramName in sorted(obj.keys()): - if paramName not in derrivedParameters: + if paramName not in derivedParameters: paramValue = obj[paramName] - #if paramName == "ThreadTileEdge": - # print "%s = %s" % (paramName, paramValue) if paramName in data: if paramValue not in data[paramName]: data[paramName].append(paramValue) else: data[paramName] = [ paramValue ] maxObjs = 1 - #print "SerialNaming:" for paramName in data: data[paramName] = sorted(data[paramName]) - #print "%s: %s" % (paramName, data[paramName]) maxObjs *= len(data[paramName]) numDigits = len(str(maxObjs)) - #print "MaxSerialNames: %u (%u)" % (maxObjs, numDigits) return [ data, numDigits ] ######################################## @@ -659,29 +943,16 @@ def getNameSerial(state, serialNaming): serial = 0 multiplier = 1 for paramName in sorted(state.keys()): - if paramName not in derrivedParameters: + if paramName not in derivedParameters: paramValue = state[paramName] paramData = data[paramName] paramNameMultiplier = len(paramData) if paramValue in paramData: paramValueIdx = paramData.index(paramValue) - #else: - #print "ERROR %s: %s not in %s" % ( paramName, paramValue, paramData ) - #print state - #printExit() - #if paramNameMultiplier > 1: - #print "serial = %u*%u + %u; multiplier = %u * %u; %s::%s in %s" % ( \ - # paramValueIdx, multiplier, serial, \ - # paramNameMultiplier, multiplier, \ - # paramName, paramValue, paramData[1] ) - serial += paramValueIdx * multiplier multiplier *= paramNameMultiplier - #if serial == 0: - # print state name = "%s%0*u" % ("S" if isinstance(state, Solution) else "K", \ numDigits, serial) - #print "SerialName: %s" % name return name @@ -746,7 +1017,8 @@ def __hash__(self): return hash(str(self)) #return 
hash(self.getAttributes()) def __eq__(self, other): - return isinstance(other, Solution) and self.getAttributes() == other.getAttributes() + #return isinstance(other, Solution) and self.getAttributes() == other.getAttributes() + return isinstance(other, Solution) and str(self) == str(other) def __ne__(self, other): result = self.__eq__(other) if result is NotImplemented: diff --git a/Tensile/SolutionWriter.py b/Tensile/SolutionWriter.py index c09b72d70..ea7585b57 100644 --- a/Tensile/SolutionWriter.py +++ b/Tensile/SolutionWriter.py @@ -787,123 +787,3 @@ def getHeaderFileString(self, solution): return fileStr - ############################################################################## - # are solution parameters (dict) self-consistent - ############################################################################## - @ staticmethod - def solutionParametersConsistent(solution): - printReason = False - - numThreads = solution["WorkGroup0"]*solution["WorkGroup1"] - if numThreads > globalParameters["MaxThreads"]: - if printReason: print2("rejecting %u threads" % numThreads) - return False - - # how many elements to load - if solution["ProblemType"]["TLUA"]: - totalElementsParaA = solution["MacroTile0"] - totalElementsPerpA = solution["LoopUnroll"] - else: - totalElementsParaA = solution["LoopUnroll"] - totalElementsPerpA = solution["MacroTile0"] - - if solution["ProblemType"]["TLUB"]: - totalElementsParaB = solution["MacroTile1"] - totalElementsPerpB = solution["LoopUnroll"] - else: - totalElementsParaB = solution["LoopUnroll"] - totalElementsPerpB = solution["MacroTile1"] - totalElementsA = totalElementsParaA * totalElementsPerpA - totalElementsB = totalElementsParaB * totalElementsPerpB - - # how many load instructions - if totalElementsA % numThreads != 0: - if printReason: print2("totalElementsA %u %% numThreads %u != 0" \ - % (totalElementsA, numThreads)) - return False - else: - solution["NumLoadsA"] = totalElementsA / numThreads - if totalElementsB % 
numThreads != 0: - if printReason: print2("totalElementsB %u %% numThreads %u != 0" \ - % (totalElementsB, numThreads)) - return False - else: - solution["NumLoadsB"] = totalElementsB / numThreads - - # how many loads para - if solution["NumLoadsA"] % solution["NumLoadsCoalescedA"] != 0: - if printReason: print2("numLoadsA %u %% numLoadsParaA %u != 0" \ - % (solution["NumLoadsA"], solution["NumLoadsCoalescedA"])) - return False - else: - solution["NumLoadsPerpendicularA"] = solution["NumLoadsA"] \ - / solution["NumLoadsCoalescedA"] - if solution["NumLoadsB"] % solution["NumLoadsCoalescedB"] != 0: - if printReason: print2("numLoadsB %u %% numLoadsParaB %u != 0" \ - % (solution["NumLoadsB"], solution["NumLoadsCoalescedB"])) - return False - else: - solution["NumLoadsPerpendicularB"] = solution["NumLoadsB"] \ - / solution["NumLoadsCoalescedB"] - - # load size para/perp A - if totalElementsParaA % solution["NumLoadsCoalescedA"] != 0: - if printReason: print2("totalElementsParaA %u %% numLoadsParaA %u != 0" \ - % (totalElementsParaA, solution["NumLoadsCoalescedA"])) - return False - #else: - # loadSizeParaA = totalElementsParaA / solution["NumLoadsCoalescedA"] - if totalElementsPerpA % solution["NumLoadsPerpendicularA"] != 0: - if printReason: print2("totalElementsPerpA %u %% numLoadsPerpA %u != 0" \ - % (totalElementsPerpA, solution["NumLoadsPerpendicularA"])) - return False - #else: - # loadSizePerpA = totalElementsPerpA / solution["NumLoadsPerpendicularA"] - - # load size para/perp B - if totalElementsParaB % solution["NumLoadsCoalescedB"] != 0: - if printReason: print2("totalElementsParaB %u %% numLoadsParaB %u != 0" \ - % (totalElementsParaB, solution["NumLoadsCoalescedB"])) - return False - #else: - # loadSizeParaB = totalElementsParaB / solution["NumLoadsCoalescedB"] - if totalElementsPerpB % solution["NumLoadsPerpendicularB"] != 0: - if printReason: print2("totalElementsPerpB %u %% numLoadsPerpB %u != 0" \ - % (totalElementsPerpB, 
solution["NumLoadsPerpendicularB"])) - return False - #else: - # loadSizePerpB = totalElementsPerpB / solution["NumLoadsPerpendicularB"] - - # too much LDS - sizeLDS = solution["LoopUnroll"] \ - * (solution["PadLDS"] * 2 + solution["MacroTile0"] \ - + solution["MacroTile1"] ) \ - * solution["ProblemType"]["DataType"].numBytes() - if sizeLDS > globalParameters["MaxLDS"]: - if printReason: print2("Kernel Uses %u > %u bytes" % ( sizeLDS, globalParameters["MaxLDS"])) - return False - - # Compiler may be causing incorrect spills on ROCm1.4 from DT on 2/21/17 - if globalParameters["Backend"] == "HIP": - if solution["ProblemType"]["DataType"].value == DataType.single: - if solution["MacroTile0"] == 128 or solution["MacroTile1"] == 128: - if solution["NumLoadsCoalescedA"] != 1 and solution["NumLoadsCoalescedB"] != 8: - return False - elif solution["ProblemType"]["DataType"].value == DataType.double: - if globalParameters["Backend"] == "HIP": - if solution["MacroTile0"] >= 64 or solution["MacroTile1"] >= 64: - return False -# validation failures -# Cijk_Ailk_Bjlk_SB_DU16_LU16_MT064_MT164_NLA16_NLB16_NLCA02_NLCB01_NLPA08_NLPB16_TT008_TT108_TTE08_WG008_WG108_WGE08 -# Cijk_Ailk_Bjlk_SB_DU16_LU16_MT064_MT164_NLA16_NLB16_NLCA04_NLCB02_NLPA04_NLPB08_TT008_TT108_TTE08_WG008_WG108_WGE08 -# Cijk_Ailk_Bjlk_SB_DU16_LU16_MT064_MT164_NLA16_NLB16_NLCA02_NLCB04_NLPA08_NLPB04_TT008_TT108_TTE08_WG008_WG108_WGE08 - -# Cijk_Ailk_Bjlk_DB_DU16_LU16_MT064_MT164_NLA16_NLB16_NLCA04_NLCB01_NLPA04_NLPB16_TT008_TT108_TTE08_WG008_WG108_WGE08 -# Cijk_Ailk_Bjlk_DB_DU08_LU08_MT064_MT164_NLA08_NLB08_NLCA01_NLCB01_NLPA08_NLPB08_TT008_TT108_TTE08_WG008_WG108_WGE08 -# Cijk_Ailk_Bjlk_DB_DU08_LU08_MT064_MT164_NLA08_NLB08_NLCA08_NLCB01_NLPA01_NLPB08_TT008_TT108_TTE08_WG008_WG108_WGE08 -# Cijk_Ailk_Bjlk_DB_DU08_LU08_MT064_MT164_NLA08_NLB08_NLCA08_NLCB08_NLPA01_NLPB01_TT008_TT108_TTE08_WG008_WG108_WGE08 -# 
Cijk_Ailk_Bjlk_DB_DU16_LU16_MT064_MT164_NLA16_NLB16_NLCA08_NLCB08_NLPA02_NLPB02_TT008_TT108_TTE08_WG008_WG108_WGE08 -# Cijk_Ailk_Bjlk_DB_DU08_LU08_MT064_MT164_NLA08_NLB08_NLCA01_NLCB08_NLPA08_NLPB01_TT008_TT108_TTE08_WG008_WG108_WGE08 - - return True - diff --git a/Tensile/Source/Client.h b/Tensile/Source/Client.h index eee665ca4..401b6ab8f 100644 --- a/Tensile/Source/Client.h +++ b/Tensile/Source/Client.h @@ -30,6 +30,7 @@ #include TensileTimer timer; +TensileTimer apiTimer; std::ofstream file; void initControls(); @@ -135,7 +136,8 @@ bool callLibrary( std::cout << " Device | Reference" << std::endl; firstPrint = false; } - std::cout << "[" << (numChecked-1) << "] " << i << ": " << tensileToString(deviceOnHostC[i]) + std::cout << "[" << (numChecked-1) << "] " << i << ": " + << tensileToString(deviceOnHostC[i]) << (equal ? "==" : "!=") << tensileToString(referenceC[i]) << std::endl; printIdx++; @@ -146,10 +148,14 @@ bool callLibrary( // time solution timer.start(); + double apiTimeUs = 0; for (unsigned int syncIdx = 0; syncIdx < numSyncsPerBenchmark; syncIdx++) { + apiTimer.start(); for (unsigned int enqIdx = 0; enqIdx < numEnqueuesPerSync; enqIdx++) { generatedCallToFunction( userSizes, alpha, beta ); } + double currentApiTimeUs = apiTimer.elapsed_us() / numEnqueuesPerSync; + apiTimeUs += currentApiTimeUs; // sync #if Tensile_BACKEND_OCL status = clFinish(stream); tensileStatusCheck(status); @@ -158,6 +164,7 @@ bool callLibrary( #endif tensileStatusCheck(status); } // sync loop + apiTimeUs /= numSyncsPerBenchmark; double timeMs = timer.elapsed_ms() / numSyncsPerBenchmark / numEnqueuesPerSync; @@ -174,30 +181,35 @@ bool callLibrary( << numFunctions << "]:" << std::setw(10) << std::fixed << std::setprecision(3) << gflops << " GFlop/s"; - if (newFastest) { - std::cout << "*"; - } else { - std::cout << " "; - } + if (newFastest) { + std::cout << "*"; + } else { + std::cout << " "; + } std::cout << " |" << std::setw(9) << std::fixed << std::setprecision(3) << timeMs << 
" ms | v: " << (numInvalids ? "FAILED" : "PASSED") - << " p: " << (numChecked-numInvalids) << "/" << numChecked << std::endl; + << " " << (numChecked-numInvalids) << "/" << numChecked; + std::cout << " | api:" << std::setw(6) << std::fixed + << std::setprecision(3) << apiTimeUs << " us"; + std::cout << std::endl; } else { std::cout << "Function[" << functionIdx << "/" << numFunctions << "]:" << std::setw(10) << std::fixed << std::setprecision(3) << gflops << " GFlop/s"; - if (newFastest) { - std::cout << "*"; - } else { - std::cout << " "; - } + if (newFastest) { + std::cout << "*"; + } else { + std::cout << " "; + } std::cout << " |" << std::setw(9) << std::fixed << std::setprecision(3) << timeMs << " ms"; - if (newFastest) { - std::cout << "*"; - } - std::cout << std::endl; + if (newFastest) { + std::cout << "*"; + } + std::cout << " | api:" << std::setw(6) << std::fixed + << std::setprecision(3) << apiTimeUs << " us"; + std::cout << std::endl; } return (numInvalids > 0); } // callLibrary @@ -359,7 +371,7 @@ bool benchmarkAllSolutionsForSize( } std::cout << " |" << std::setw(9) << std::fixed << std::setprecision(3) << timeMs << " ms | v: " << (numInvalids ? 
"FAILED" : "PASSED") - << " p: " << (numChecked-numInvalids) << "/" << numChecked << std::endl; + << " " << (numChecked-numInvalids) << "/" << numChecked << std::endl; } #if 1 else { diff --git a/Tensile/TensileCreateLibrary.py b/Tensile/TensileCreateLibrary.py index ef7c13392..222a71d55 100644 --- a/Tensile/TensileCreateLibrary.py +++ b/Tensile/TensileCreateLibrary.py @@ -171,9 +171,8 @@ def writeLogic(outputPath, logicList, solutionWriter ): scheduleName = logicProblemType[0] problemType = logicProblemType[1] solutions = logicProblemType[2] - skinnyLogic0 = logicProblemType[3] - skinnyLogic1 = logicProblemType[4] - diagonalLogic = logicProblemType[5] + indexOrder = logicProblemType[3] + logic = logicProblemType[4] # solution names solutionNames = [] @@ -208,6 +207,7 @@ def writeLogic(outputPath, logicList, solutionWriter ): for i in range(0, len(argList)): s += " %s%s" % (argList[i], ",\n" if i < len(argList)-1 else ") {\n\n") + """ indent = " " s += "%ssize_t sizeC = size%s" % ( indent, indexChars[0]) for i in range(1, problemType["NumIndicesC"]): @@ -218,56 +218,11 @@ def writeLogic(outputPath, logicList, solutionWriter ): for i in range(1, len(problemType["IndicesSummation"])): s += "*size%s" % indexChars[problemType["IndicesSummation"][i]] s += ";\n\n" - for rule in skinnyLogic0: - print2(rule) - for rule in skinnyLogic1: - print2(rule) + """ print2(solutionNames) - for ruleIdx in range(0, len(diagonalLogic)): - rule = diagonalLogic[ruleIdx] - print2(rule) - winnerIdx = rule[0] - problemSize = rule[1] - minGFlops = rule[2] - maxGFlops = rule[3] - # rule logic - if ruleIdx == len(diagonalLogic)-1: - if len(diagonalLogic) > 1: - s += "%selse" % indent - else: - s += "%s" % indent - else: - s += "%s%s(sizeC >= static_cast(%u" % (indent, ("if" if ruleIdx == 0 else "else if"), problemSize[0]) - for i in range(1, problemType["NumIndicesC"]): - s += "*%u" % problemSize[i] - s += "))" - s += " return %s(" % solutionNames[winnerIdx] - # solution parameters - s += " 
################################################################################
# Write Logic Recursive
################################################################################
def writeLogicRec(depth, indexOrder, logic, solutionNames, problemType):
    """Render one level of the size-threshold decision tree as C-style text.

    Each rule in ``logic`` is indexed as ``rule[0]`` (a threshold) and
    ``rule[1]`` (a payload). A positive threshold emits an
    ``if (size<X> < threshold)`` guard, where ``<X>`` is the index character
    selected by ``indexOrder[depth]``; any other threshold emits an
    unconditional branch.

    At the deepest level (``depth == len(indexOrder) - 1``) the payload is an
    index into ``solutionNames`` and the rule becomes a ``return <call>;``
    statement built by ``writeSolutionCall``. At every other level the payload
    is the rule list for the next level, emitted inside a ``{ ... }`` block.

    Args:
        depth: Current recursion depth; selects the entry of ``indexOrder``.
        indexOrder: Sequence of index numbers, one per level of the tree.
        logic: Sequence of rules for this level (see above).
        solutionNames: Solution function names, looked up by ``rule[1]`` at
            the deepest level.
        problemType: Mapping forwarded unchanged to ``writeSolutionCall``.

    Returns:
        The generated text for this level and everything nested below it;
        an empty string when ``logic`` is empty.
    """
    indexChars = globalParameters["IndexChars"]
    # One unit of base indentation plus one unit per nesting level.
    # NOTE(review): the width of these literals is copied as it appears in
    # the reviewed text, whose whitespace may have been collapsed - confirm.
    indent = " " + " " * depth
    lowestLevel = depth == len(indexOrder) - 1

    # Collect fragments and join once instead of growing a string with +=.
    pieces = []
    for rule in logic:
        threshold = rule[0]
        guarded = threshold > 0
        if lowestLevel:
            solutionCall = writeSolutionCall(solutionNames[rule[1]], problemType)
            if guarded:
                pieces.append("%sif (size%s < %u) return %s;\n" % (
                    indent, indexChars[indexOrder[depth]], threshold,
                    solutionCall))
            else:
                pieces.append("%sreturn %s;\n" % (indent, solutionCall))
        else:
            if guarded:
                pieces.append("%sif (size%s < %u) {\n" % (
                    indent, indexChars[indexOrder[depth]], threshold))
            else:
                pieces.append("%s{\n" % (indent))
            # rule[1] holds the rules for the next index in indexOrder.
            pieces.append(writeLogicRec(
                depth + 1, indexOrder, rule[1], solutionNames, problemType))
            pieces.append("%s}\n" % (indent))
    return "".join(pieces)


################################################################################
# Write Solution Call
################################################################################
def writeSolutionCall(solutionName, problemType):
    """Build the text of a call expression for one solution function.

    The argument list is, in order: the data pointers and ``alpha``; ``beta``
    when ``problemType["UseBeta"]`` is truthy; the three offsets; one stride
    per index of C, A and B (starting at index 0 when
    ``problemType["UseInitialStrides"]`` is truthy, otherwise at index 1);
    one size per index up to ``problemType["TotalIndices"]``; and finally the
    stream/event arguments.

    Args:
        solutionName: Name of the function being called.
        problemType: Mapping providing ``UseBeta``, ``UseInitialStrides``,
            ``NumIndicesC``, ``IndexAssignmentsA``, ``IndexAssignmentsB`` and
            ``TotalIndices``.

    Returns:
        The call expression as a string, without a trailing semicolon.
    """
    indexChars = globalParameters["IndexChars"]
    firstStride = 0 if problemType["UseInitialStrides"] else 1
    assignmentsA = problemType["IndexAssignmentsA"]
    assignmentsB = problemType["IndexAssignmentsB"]

    args = [" dataC, dataA, dataB, alpha"]
    if problemType["UseBeta"]:
        args.append(", beta")
    args.append(", offsetC, offsetA, offsetB")

    # C strides are named after their own index; A and B strides are named
    # after the index each tensor dimension is assigned to.
    args.extend(", strideC%u%s" % (i, indexChars[i])
                for i in range(firstStride, problemType["NumIndicesC"]))
    args.extend(", strideA%u%s" % (i, indexChars[assignmentsA[i]])
                for i in range(firstStride, len(assignmentsA)))
    args.extend(", strideB%u%s" % (i, indexChars[assignmentsB[i]])
                for i in range(firstStride, len(assignmentsB)))
    args.extend(", size%s" % indexChars[i]
                for i in range(0, problemType["TotalIndices"]))
    args.append(", stream, numInputEvents, inputEvents, outputEvent )")
    return "%s(%s" % (solutionName, "".join(args))
logicFile) ############################################################################## # Parse config files @@ -424,10 +446,10 @@ def TensileCreateLibrary(): solutions = [] logicList = [] for logicFileName in logicFiles: - (scheduleName, problemType, solutionsForType, skinnyLogic0, skinnyLogic1, \ - diagonalLogic) = YAMLIO.readLibraryLogicForProblemType(logicFileName) + (scheduleName, problemType, solutionsForType, indexOrder, logic) \ + = YAMLIO.readLibraryLogicForProblemType(logicFileName) logicList.append((scheduleName, problemType, solutionsForType, \ - skinnyLogic0, skinnyLogic1, diagonalLogic)) + indexOrder, logic )) for solution in solutionsForType: if solution not in solutions: solutions.append(solution) diff --git a/Tensile/YAMLIO.py b/Tensile/YAMLIO.py index c0f7393b3..a806bd54e 100644 --- a/Tensile/YAMLIO.py +++ b/Tensile/YAMLIO.py @@ -85,27 +85,52 @@ def readSolutions( filename ): # 1 yaml per problem type # problemType, skinny0, skinny1, diagonal ################################################################################ -def writeLibraryLogicForProblemType( filePath, schedulePrefix, logic): - problemType = logic[0] - solutions = logic[1] - skinnyLogic0 = logic[2] - skinnyLogic1 = logic[3] - diagonalLogic = logic[4] +def writeLibraryLogicForProblemType( filePath, schedulePrefix, logicTuple): + problemType = logicTuple[0] + solutions = logicTuple[1] + indexOrder = logicTuple[2] + logic = logicTuple[3] filename = os.path.join(filePath, "%s_%s.yaml" \ % (schedulePrefix, str(problemType))) print2("# writeLogic( %s )" % ( filename )) - # open file + data = [] + # logic name + data.append(globalParameters["Name"]) + # problem type + problemTypeState = problemType.state + problemTypeState["DataType"] = \ + problemTypeState["DataType"].value + data.append(problemTypeState) + # solutions + solutionList = [] + for solution in solutions: + solutionState = solution.state + solutionState["ProblemType"] = solutionState["ProblemType"].state + 
solutionState["ProblemType"]["DataType"] = \ + solutionState["ProblemType"]["DataType"].value + solutionList.append(solutionState) + data.append(solutionList) + # index order + data.append(indexOrder) + # logic + data.append(logic) + + # open & write file try: stream = open(filename, "w") + #yaml.dump(data, stream, default_flow_style=False) + yaml.dump(data, stream) + stream.close() except IOError: printExit("Cannot open file: %s" % filename) + """ + #data = [ globalParameters["Name"], problemTypeState, [], [], [] ] # write problem type problemTypeState = problemType.state problemTypeState["DataType"] = \ problemTypeState["DataType"].value - data = [ globalParameters["Name"], problemTypeState, [], [], [], [] ] for solution in solutions: solutionState = solution.state solutionState["ProblemType"] = solutionState["ProblemType"].state @@ -122,6 +147,7 @@ def writeLibraryLogicForProblemType( filePath, schedulePrefix, logic): #stream.write(data) yaml.dump(data, stream, default_flow_style=False) stream.close() + """ def readLibraryLogicForProblemType( filename ): @@ -134,19 +160,20 @@ def readLibraryLogicForProblemType( filename ): stream.close() # verify - if len(data) < 6: + if len(data) < 5: printExit("len(%s) %u < 6" % (filename, len(data))) # parse out objects scheduleName = data[0] problemTypeState = data[1] solutionStates = data[2] - skinnyLogic0 = data[3] - skinnyLogic1 = data[4] - diagonalLogic = data[5] + indexOrder = data[3] + logic = data[4] - solutions = [] + # unpack problemType problemType = ProblemType(problemTypeState) + # unpack solutions + solutions = [] for i in range(0, len(solutionStates)): solutionState = solutionStates[i] solutionObject = Solution(solutionState) @@ -155,5 +182,4 @@ def readLibraryLogicForProblemType( filename ): % (problemType, solutionObject["ProblemType"])) solutions.append(solutionObject) - return (scheduleName, problemType, solutions, skinnyLogic0, skinnyLogic1, \ - diagonalLogic) + return (scheduleName, problemType, 
solutions, indexOrder, logic ) diff --git a/Tensile/__init__.py b/Tensile/__init__.py index 58039f505..8a124bf64 100644 --- a/Tensile/__init__.py +++ b/Tensile/__init__.py @@ -1 +1 @@ -__version__ = "2.1.1" +__version__ = "2.2.0"