From 12354124024873562a8cb578b9c24e4de54c2086 Mon Sep 17 00:00:00 2001 From: David Tanner Date: Fri, 24 Feb 2017 09:31:13 -0600 Subject: [PATCH 01/21] fixing innocuous indentation error --- Tensile/SolutionWriter.py | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/Tensile/SolutionWriter.py b/Tensile/SolutionWriter.py index c09b72d70..289bd1940 100644 --- a/Tensile/SolutionWriter.py +++ b/Tensile/SolutionWriter.py @@ -830,21 +830,21 @@ def solutionParametersConsistent(solution): else: solution["NumLoadsB"] = totalElementsB / numThreads - # how many loads para - if solution["NumLoadsA"] % solution["NumLoadsCoalescedA"] != 0: - if printReason: print2("numLoadsA %u %% numLoadsParaA %u != 0" \ - % (solution["NumLoadsA"], solution["NumLoadsCoalescedA"])) - return False - else: - solution["NumLoadsPerpendicularA"] = solution["NumLoadsA"] \ - / solution["NumLoadsCoalescedA"] - if solution["NumLoadsB"] % solution["NumLoadsCoalescedB"] != 0: - if printReason: print2("numLoadsB %u %% numLoadsParaB %u != 0" \ - % (solution["NumLoadsB"], solution["NumLoadsCoalescedB"])) - return False - else: - solution["NumLoadsPerpendicularB"] = solution["NumLoadsB"] \ - / solution["NumLoadsCoalescedB"] + # how many loads para + if solution["NumLoadsA"] % solution["NumLoadsCoalescedA"] != 0: + if printReason: print2("numLoadsA %u %% numLoadsParaA %u != 0" \ + % (solution["NumLoadsA"], solution["NumLoadsCoalescedA"])) + return False + else: + solution["NumLoadsPerpendicularA"] = solution["NumLoadsA"] \ + / solution["NumLoadsCoalescedA"] + if solution["NumLoadsB"] % solution["NumLoadsCoalescedB"] != 0: + if printReason: print2("numLoadsB %u %% numLoadsParaB %u != 0" \ + % (solution["NumLoadsB"], solution["NumLoadsCoalescedB"])) + return False + else: + solution["NumLoadsPerpendicularB"] = solution["NumLoadsB"] \ + / solution["NumLoadsCoalescedB"] # load size para/perp A if totalElementsParaA % solution["NumLoadsCoalescedA"] != 0: From 137cd977383f20a3bc11a37729fb7cd02bce6d42 Mon Sep 17 00:00:00 2001 From: David Tanner Date: Fri, 24 Feb 2017 10:18:31 -0600 Subject: [PATCH 02/21] fixing solution validity and derived parameters --- Tensile/BenchmarkProblems.py | 14 +++- Tensile/SolutionStructs.py | 145 ++++++++++++++++++++++++++++++++++- Tensile/SolutionWriter.py | 4 + 3 files changed, 156 insertions(+), 7 deletions(-) diff --git a/Tensile/BenchmarkProblems.py b/Tensile/BenchmarkProblems.py index 8ec4ed719..4e1c3f932 100644 --- a/Tensile/BenchmarkProblems.py +++ b/Tensile/BenchmarkProblems.py @@ -169,8 +169,14 @@ def benchmarkProblemType( config ): benchmarkStep.initialSolutionParameters[initialSolutionParameterName] # TODO check if solution matches problem size for exact tile kernels solutionObject = Solution(solution) - if SolutionWriter.solutionParametersConsistent(solutionObject): - solutions[hardcodedIdx].append(solutionObject) + if solutionObject["Valid"]: + print str(solutionObject), " LOOKING" + for s in solutions: + print s + if solutionObject not in solutions: + solutions[hardcodedIdx].append(solutionObject) + else: + print "SOLUTION ALREADY IN SOLUTIONS" if globalParameters["PrintLevel"] >= 1: sys.stdout.write("|") else: @@ -541,8 +547,8 @@ def get( lookupHardcodedParameters, winners ): #for paramName in hardcodedFrozen: # paramValue = hardcodedFrozen[paramName] # matchUnion[paramName] = paramValue - Solution.assignDimsFromEdgeAndShape(matchUnion) - Solution.assignDimsFromEdgeAndShape(hardcodedFrozen.parameters) + Solution.assignDerivedParameters(matchUnion) + Solution.assignDerivedParameters(hardcodedFrozen.parameters) if matchUnion["MacroTile0"] != lookupMacroTile0 \ or matchUnion["MacroTile1"] != lookupMacroTile1: matchMacroTile = False diff --git a/Tensile/SolutionStructs.py b/Tensile/SolutionStructs.py index b21f98728..e1fd318ca 100644 --- a/Tensile/SolutionStructs.py +++ b/Tensile/SolutionStructs.py @@ -505,7 +505,7 @@ def __init__(self, config): if key != "ProblemType" and key not in self.state: self.state[key] = config[key] - Solution.assignDimsFromEdgeAndShape(self.state) + Solution.assignDerivedParameters(self.state) ######################################## # get a list of kernel parameters for this solution @@ -531,7 +531,8 @@ def getKernels(self): ######################################## # assign Dim0, 1 based on edge and shape @staticmethod - def assignDimsFromEdgeAndShape(state): + def assignDerivedParameters(state): + # workgroup sizes state["WorkGroup0"] = state["WorkGroupEdge"] state["WorkGroup1"] = state["WorkGroupEdge"] @@ -556,6 +557,143 @@ def assignDimsFromEdgeAndShape(state): if "SplitU" in state and "LoopUnroll" in state: state["DepthU"] = state["SplitU"] * state["LoopUnroll"] + printReason = False + + # num threads + state["NumThreads"] = state["WorkGroup0"]*state["WorkGroup1"] + if state["NumThreads"] > globalParameters["MaxThreads"]: + if printReason: print2("rejecting %u threads" % state["NumThreads"]) + state["Valid"] = False + return + + # how many elements to load + if state["ProblemType"]["TLUA"]: + totalElementsParaA = state["MacroTile0"] + totalElementsPerpA = state["LoopUnroll"] + else: + totalElementsParaA = state["LoopUnroll"] + totalElementsPerpA = state["MacroTile0"] + + if state["ProblemType"]["TLUB"]: + totalElementsParaB = state["MacroTile1"] + totalElementsPerpB = state["LoopUnroll"] + else: + totalElementsParaB = state["LoopUnroll"] + totalElementsPerpB = state["MacroTile1"] + totalElementsA = totalElementsParaA * totalElementsPerpA + totalElementsB = totalElementsParaB * totalElementsPerpB + + # how many load instructions + if totalElementsA % state["NumThreads"] != 0: + if printReason: print2("totalElementsA %u %% NumThreads %u != 0" \ + % (totalElementsA, state["NumThreads"])) + state["Valid"] = False + return + else: + state["NumLoadsA"] = totalElementsA / state["NumThreads"] + if totalElementsB % state["NumThreads"] != 0: + if printReason: print2("totalElementsB %u %% NumThreads %u != 0" \ + % (totalElementsB, state["NumThreads"])) + state["Valid"] = False + return + else: + state["NumLoadsB"] = totalElementsB / state["NumThreads"] + + # how many loads para + if state["NumLoadsCoalescedA"] < 1: + state["NumLoadsCoalescedA"] = state["NumLoadsA"] + print "Assigning NLCA=%u" % state["NumLoadsA"] + if state["NumLoadsA"] % state["NumLoadsCoalescedA"] != 0: + if printReason: print2("numLoadsA %u %% numLoadsParaA %u != 0" \ + % (state["NumLoadsA"], state["NumLoadsCoalescedA"])) + state["Valid"] = False + return + else: + state["NumLoadsPerpendicularA"] = state["NumLoadsA"] \ + / state["NumLoadsCoalescedA"] + if state["NumLoadsCoalescedB"] < 1: + state["NumLoadsCoalescedB"] = state["NumLoadsB"] + print "Assigning NLCB=%u" % state["NumLoadsB"] + if state["NumLoadsB"] % state["NumLoadsCoalescedB"] != 0: + if printReason: print2("numLoadsB %u %% numLoadsParaB %u != 0" \ + % (state["NumLoadsB"], state["NumLoadsCoalescedB"])) + state["Valid"] = False + return + else: + state["NumLoadsPerpendicularB"] = state["NumLoadsB"] \ + / state["NumLoadsCoalescedB"] + + # load size para/perp A + if totalElementsParaA % state["NumLoadsCoalescedA"] != 0: + if printReason: print2("totalElementsParaA %u %% numLoadsParaA %u != 0" \ + % (totalElementsParaA, state["NumLoadsCoalescedA"])) + state["Valid"] = False + return + #else: + # loadSizeParaA = totalElementsParaA / state["NumLoadsCoalescedA"] + if totalElementsPerpA % state["NumLoadsPerpendicularA"] != 0: + if printReason: print2("totalElementsPerpA %u %% numLoadsPerpA %u != 0" \ + % (totalElementsPerpA, state["NumLoadsPerpendicularA"])) + state["Valid"] = False + return + #else: + # loadSizePerpA = totalElementsPerpA / state["NumLoadsPerpendicularA"] + + # load size para/perp B + if totalElementsParaB % state["NumLoadsCoalescedB"] != 0: + if printReason: print2("totalElementsParaB %u %% numLoadsParaB %u != 0" \ + % (totalElementsParaB, state["NumLoadsCoalescedB"])) + state["Valid"] = False + return + #else: + # loadSizeParaB = totalElementsParaB / state["NumLoadsCoalescedB"] + if totalElementsPerpB % state["NumLoadsPerpendicularB"] != 0: + if printReason: print2("totalElementsPerpB %u %% numLoadsPerpB %u != 0" \ + % (totalElementsPerpB, state["NumLoadsPerpendicularB"])) + state["Valid"] = False + return + #else: + # loadSizePerpB = totalElementsPerpB / state["NumLoadsPerpendicularB"] + + # too much LDS + sizeLDS = state["LoopUnroll"] \ + * (state["PadLDS"] * 2 + state["MacroTile0"] \ + + state["MacroTile1"] ) \ + * state["ProblemType"]["DataType"].numBytes() + if sizeLDS > globalParameters["MaxLDS"]: + if printReason: print2("Kernel Uses %u > %u bytes" % ( sizeLDS, globalParameters["MaxLDS"])) + state["Valid"] = False + return + + # Compiler may be causing incorrect spills on ROCm1.4 from DT on 2/21/17 + if globalParameters["Backend"] == "HIP": + if state["ProblemType"]["DataType"].value == DataType.single: + if state["MacroTile0"] == 128 or state["MacroTile1"] == 128: + if state["NumLoadsCoalescedA"] != 1 and state["NumLoadsCoalescedB"] != 8: + state["Valid"] = False + return + elif state["ProblemType"]["DataType"].value == DataType.double: + if globalParameters["Backend"] == "HIP": + if state["MacroTile0"] >= 64 or state["MacroTile1"] >= 64: + state["Valid"] = False + return + + state["Valid"] = True + +# validation failures +# Cijk_Ailk_Bjlk_SB_DU16_LU16_MT064_MT164_NLA16_NLB16_NLCA02_NLCB01_NLPA08_NLPB16_TT008_TT108_TTE08_WG008_WG108_WGE08 +# Cijk_Ailk_Bjlk_SB_DU16_LU16_MT064_MT164_NLA16_NLB16_NLCA04_NLCB02_NLPA04_NLPB08_TT008_TT108_TTE08_WG008_WG108_WGE08 +# Cijk_Ailk_Bjlk_SB_DU16_LU16_MT064_MT164_NLA16_NLB16_NLCA02_NLCB04_NLPA08_NLPB04_TT008_TT108_TTE08_WG008_WG108_WGE08 + +# Cijk_Ailk_Bjlk_DB_DU16_LU16_MT064_MT164_NLA16_NLB16_NLCA04_NLCB01_NLPA04_NLPB16_TT008_TT108_TTE08_WG008_WG108_WGE08 +# Cijk_Ailk_Bjlk_DB_DU08_LU08_MT064_MT164_NLA08_NLB08_NLCA01_NLCB01_NLPA08_NLPB08_TT008_TT108_TTE08_WG008_WG108_WGE08 +# Cijk_Ailk_Bjlk_DB_DU08_LU08_MT064_MT164_NLA08_NLB08_NLCA08_NLCB01_NLPA01_NLPB08_TT008_TT108_TTE08_WG008_WG108_WGE08 +# Cijk_Ailk_Bjlk_DB_DU08_LU08_MT064_MT164_NLA08_NLB08_NLCA08_NLCB08_NLPA01_NLPB01_TT008_TT108_TTE08_WG008_WG108_WGE08 +# Cijk_Ailk_Bjlk_DB_DU16_LU16_MT064_MT164_NLA16_NLB16_NLCA08_NLCB08_NLPA02_NLPB02_TT008_TT108_TTE08_WG008_WG108_WGE08 +# Cijk_Ailk_Bjlk_DB_DU08_LU08_MT064_MT164_NLA08_NLB08_NLCA01_NLCB08_NLPA08_NLPB01_TT008_TT108_TTE08_WG008_WG108_WGE08 + + + ######################################## # create a dictionary with booleans on whether to include parameter in name @@ -746,7 +884,8 @@ def __hash__(self): return hash(str(self)) #return hash(self.getAttributes()) def __eq__(self, other): - return isinstance(other, Solution) and self.getAttributes() == other.getAttributes() + #return isinstance(other, Solution) and self.getAttributes() == other.getAttributes() + return isinstance(other, Solution) and str(self) == str(other) def __ne__(self, other): result = self.__eq__(other) if result is NotImplemented: diff --git a/Tensile/SolutionWriter.py b/Tensile/SolutionWriter.py index 289bd1940..452c9c45e 100644 --- a/Tensile/SolutionWriter.py +++ b/Tensile/SolutionWriter.py @@ -831,6 +831,8 @@ def solutionParametersConsistent(solution): solution["NumLoadsB"] = totalElementsB / numThreads # how many loads para + if solution["NumLoadsCoalescedA"] < 1: + solution["NumLoadsCoalescedA"] = solution["NumLoadsA"] if solution["NumLoadsA"] % solution["NumLoadsCoalescedA"] != 0: if printReason: print2("numLoadsA %u %% numLoadsParaA %u != 0" \ % (solution["NumLoadsA"], solution["NumLoadsCoalescedA"])) @@ -838,6 +840,8 @@ def solutionParametersConsistent(solution): else: solution["NumLoadsPerpendicularA"] = solution["NumLoadsA"] \ / solution["NumLoadsCoalescedA"] + if solution["NumLoadsCoalescedB"] < 1: + solution["NumLoadsCoalescedB"] = solution["NumLoadsB"] if solution["NumLoadsB"] % solution["NumLoadsCoalescedB"] != 0: if printReason: print2("numLoadsB %u %% numLoadsParaB %u != 0" \ % (solution["NumLoadsB"], solution["NumLoadsCoalescedB"])) From d795ca6e65501423553b6b760f2ab9b0000b60d9 Mon Sep 17 00:00:00 2001 From: David Tanner Date: Fri, 24 Feb 2017 12:34:33 -0600 Subject: [PATCH 03/21] fixed NumLoadsCoalesced=-1, assigningDerrivedParameters --- Tensile/BenchmarkProblems.py | 18 +-- Tensile/SolutionStructs.py | 213 +++++++++++++++++++---------------- Tensile/__init__.py | 2 +- 3 files changed, 123 insertions(+), 110 deletions(-) diff --git a/Tensile/BenchmarkProblems.py b/Tensile/BenchmarkProblems.py index 4e1c3f932..cd0bbc575 100644 --- a/Tensile/BenchmarkProblems.py +++ b/Tensile/BenchmarkProblems.py @@ -37,7 +37,7 @@ def benchmarkProblemType( config ): totalBenchmarkSteps = len(benchmarkProcess) winners = WinningParameterDict() - print1("NumBenchmarkSteps: %u" % totalBenchmarkSteps) + print1("# NumBenchmarkSteps: %u" % totalBenchmarkSteps) print1("") print1(HR) print1("# Done Creating BenchmarkProcess Object") @@ -170,13 +170,13 @@ def benchmarkProblemType( config ): # TODO check if solution matches problem size for exact tile kernels solutionObject = Solution(solution) if solutionObject["Valid"]: - print str(solutionObject), " LOOKING" - for s in solutions: - print s - if solutionObject not in solutions: + hasSolution = False + for hardcodedSolutions in solutions: + for hardcodedSolution in hardcodedSolutions: + if hardcodedSolution == solutionObject: + hasSolution = True + if not hasSolution: solutions[hardcodedIdx].append(solutionObject) - else: - print "SOLUTION ALREADY IN SOLUTIONS" if globalParameters["PrintLevel"] >= 1: sys.stdout.write("|") else: @@ -547,8 +547,8 @@ def get( lookupHardcodedParameters, winners ): #for paramName in hardcodedFrozen: # paramValue = hardcodedFrozen[paramName] # matchUnion[paramName] = paramValue - Solution.assignDerivedParameters(matchUnion) - Solution.assignDerivedParameters(hardcodedFrozen.parameters) + Solution.assignProblemIndependentDerivedParameters(matchUnion) + Solution.assignProblemIndependentDerivedParameters(hardcodedFrozen.parameters) if matchUnion["MacroTile0"] != lookupMacroTile0 \ or matchUnion["MacroTile1"] != lookupMacroTile1: matchMacroTile = False diff --git a/Tensile/SolutionStructs.py b/Tensile/SolutionStructs.py index e1fd318ca..5f4eff66e 100644 --- a/Tensile/SolutionStructs.py +++ b/Tensile/SolutionStructs.py @@ -95,9 +95,9 @@ def zeroString(self, backend): zeroString = "(%s)(" % self.toDevice(backend) if self.value == self.single or self.value == self.half: zeroString += "0.f" - elif self.value == self.double: + elif self.value == self.double: zeroString += "0.0" - elif self.value == self.complexSingle: + elif self.value == self.complexSingle: zeroString += "0.f, 0.f" elif self.value == self.complexDouble: zeroString += "0.0, 0.0" @@ -214,7 +214,8 @@ def __init__(self, config): elif self["OperationType"] == "TensorContraction": self.initTensorContraction(config) - self.assignIndices() + self.state["AssignedDerivedParameters"] = False + ProblemType.assignDerivedParameters(self.state) ######################################## @@ -249,90 +250,98 @@ def isTensorContraction(self): ######################################## # determine d0, d1, dU - def assignIndices(self): - self["TotalIndices"] = max(max(self["IndexAssignmentsA"])+1, max(self["IndexAssignmentsB"])+1) + @staticmethod + def assignDerivedParameters(state): + if "AssignedDerivedParameters" in state: + if state["AssignedDerivedParameters"]: + return + state["AssignedDerivedParameters"] = False + + state["TotalIndices"] = max(max(state["IndexAssignmentsA"])+1, \ + max(state["IndexAssignmentsB"])+1) # determine num free, batch - self["IndicesFree"] = [] - self["IndicesBatch"] = [] - self["IndicesSummation"] = [] + state["IndicesFree"] = [] + state["IndicesBatch"] = [] + state["IndicesSummation"] = [] - for i in range(0, self["NumIndicesC"]): - inA = i in self["IndexAssignmentsA"] - inB = i in self["IndexAssignmentsB"] + for i in range(0, state["NumIndicesC"]): + inA = i in state["IndexAssignmentsA"] + inB = i in state["IndexAssignmentsB"] if inA and inB: - #self["NumIndicesBatch"] = (i+1)-self["NumIndicesFree"] - self["IndicesBatch"].append(i) + #state["NumIndicesBatch"] = (i+1)-state["NumIndicesFree"] + state["IndicesBatch"].append(i) elif inA or inB: - #self["NumIndicesFree"] = (i+1) - self["IndicesFree"].append(i) + #state["NumIndicesFree"] = (i+1) + state["IndicesFree"].append(i) else: printExit("invalid index %u" % i) # determine num summation - for i in range(self["NumIndicesC"], self["TotalIndices"]): - inA = i in self["IndexAssignmentsA"] - inB = i in self["IndexAssignmentsB"] + for i in range(state["NumIndicesC"], state["TotalIndices"]): + inA = i in state["IndexAssignmentsA"] + inB = i in state["IndexAssignmentsB"] if inA and inB: - #self["NumIndicesSummation"] = (i+1)-self["NumIndicesC"] - self.state["IndicesSummation"].append(i) + #state["NumIndicesSummation"] = (i+1)-state["NumIndicesC"] + state["IndicesSummation"].append(i) else: printExit("invalid index %u" % i) - self["NumIndicesFree"] = len(self["IndicesFree"]) - self["NumIndicesBatch"] = len(self["IndicesBatch"]) - self["NumIndicesSummation"] = len(self["IndicesSummation"]) + state["NumIndicesFree"] = len(state["IndicesFree"]) + state["NumIndicesBatch"] = len(state["IndicesBatch"]) + state["NumIndicesSummation"] = len(state["IndicesSummation"]) # by default, unroll index will be the first summation index # TODO sort summation indices by "stride" - self["IndexUnroll"] = self["IndicesSummation"][0] - for i in range(0, len(self["IndexAssignmentsA"])): - if self["IndexAssignmentsA"][i] == self["IndexUnroll"]: - self["IndexUnrollA"] = i + state["IndexUnroll"] = state["IndicesSummation"][0] + for i in range(0, len(state["IndexAssignmentsA"])): + if state["IndexAssignmentsA"][i] == state["IndexUnroll"]: + state["IndexUnrollA"] = i break - for i in range(0, len(self["IndexAssignmentsB"])): - if self["IndexAssignmentsB"][i] == self["IndexUnroll"]: - self["IndexUnrollB"] = i + for i in range(0, len(state["IndexAssignmentsB"])): + if state["IndexAssignmentsB"][i] == state["IndexUnroll"]: + state["IndexUnrollB"] = i break # assign d0, d1 - self["Index01A"] = -1 - self["Index01B"] = -1 - for i in self["IndexAssignmentsA"]: - if i < self["NumIndicesC"]: - self["Index01A"] = i + state["Index01A"] = -1 + state["Index01B"] = -1 + for i in state["IndexAssignmentsA"]: + if i < state["NumIndicesC"]: + state["Index01A"] = i break - for i in self["IndexAssignmentsB"]: - if i < self["NumIndicesC"]: - self["Index01B"] = i + for i in state["IndexAssignmentsB"]: + if i < state["NumIndicesC"]: + state["Index01B"] = i break # whichever has lower stride in C (lower value), is 0, other is 1 - if self["Index01A"] < self["Index01B"]: - self["Index0"] = self["Index01A"] - self["Index1"] = self["Index01B"] - self["Tensor0"] = 0 - self["Tensor1"] = 1 - self["TileA"] = 0 - self["TileB"] = 1 + if state["Index01A"] < state["Index01B"]: + state["Index0"] = state["Index01A"] + state["Index1"] = state["Index01B"] + state["Tensor0"] = 0 + state["Tensor1"] = 1 + state["TileA"] = 0 + state["TileB"] = 1 else: - self["Index0"] = self["Index01B"] - self["Index1"] = self["Index01A"] - self["Tensor0"] = 1 - self["Tensor1"] = 0 - self["TileA"] = 1 - self["TileB"] = 0 + state["Index0"] = state["Index01B"] + state["Index1"] = state["Index01A"] + state["Tensor0"] = 1 + state["Tensor1"] = 0 + state["TileA"] = 1 + state["TileB"] = 0 # generalize transpose - strideIdxA = self["IndexAssignmentsA"].index(self["Index01A"]) - strideIdxB = self["IndexAssignmentsB"].index(self["Index01B"]) - unrollIdxA = self["IndexAssignmentsA"].index(self["IndexUnroll"]) - unrollIdxB = self["IndexAssignmentsB"].index(self["IndexUnroll"]) - self["TLUA"] = strideIdxA < unrollIdxA - self["TLUB"] = strideIdxB < unrollIdxB + strideIdxA = state["IndexAssignmentsA"].index(state["Index01A"]) + strideIdxB = state["IndexAssignmentsB"].index(state["Index01B"]) + unrollIdxA = state["IndexAssignmentsA"].index(state["IndexUnroll"]) + unrollIdxB = state["IndexAssignmentsB"].index(state["IndexUnroll"]) + state["TLUA"] = strideIdxA < unrollIdxA + state["TLUB"] = strideIdxB < unrollIdxB #unrollDimStrideGreaterThanTileDimStrideA = TLUA #unrollDimStrideLessThanTileDimStrideB = !TLUB + state["AssignedDerivedParameters"] = True @@ -364,6 +373,18 @@ def __str__(self): if self["UseInitialStrides"]: name += "I" return name + def keys(self): + return self.state.keys() + def __len__(self): + return len(self.state) + def __iter__(self): + return iter(self.state) + + + + + + def __getitem__(self, key): return self.state[key] def __setitem__(self, key, value): @@ -504,7 +525,9 @@ def __init__(self, config): for key in config: if key != "ProblemType" and key not in self.state: self.state[key] = config[key] - + self["Valid"] = True + self["AssignedProblemIndependentDerivedParameters"] = False + self["AssignedDerivedParameters"] = False Solution.assignDerivedParameters(self.state) ######################################## @@ -529,10 +552,13 @@ def getKernels(self): ######################################## - # assign Dim0, 1 based on edge and shape + # assign tile sizes @staticmethod - def assignDerivedParameters(state): - + def assignProblemIndependentDerivedParameters(state): + if "AssignedProblemIndependentDerivedParameters" in state: + if state["AssignedProblemIndependentDerivedParameters"]: + return + state["AssignedProblemIndependentDerivedParameters"] = False # workgroup sizes state["WorkGroup0"] = state["WorkGroupEdge"] state["WorkGroup1"] = state["WorkGroupEdge"] @@ -557,14 +583,25 @@ def assignDerivedParameters(state): if "SplitU" in state and "LoopUnroll" in state: state["DepthU"] = state["SplitU"] * state["LoopUnroll"] - printReason = False - # num threads state["NumThreads"] = state["WorkGroup0"]*state["WorkGroup1"] if state["NumThreads"] > globalParameters["MaxThreads"]: if printReason: print2("rejecting %u threads" % state["NumThreads"]) state["Valid"] = False - return + state["AssignedProblemIndependentDerivedParameters"] = True + + ######################################## + # assign all derived parameters + @staticmethod + def assignDerivedParameters(state): + Solution.assignProblemIndependentDerivedParameters(state) + if "AssignedDerivedParameters" in state: + if state["AssignedDerivedParameters"]: + return + state["AssignedDerivedParameters"] = False + + ProblemType.assignDerivedParameters(state["ProblemType"]) + printReason = False # how many elements to load if state["ProblemType"]["TLUA"]: @@ -588,37 +625,35 @@ def assignDerivedParameters(state): if printReason: print2("totalElementsA %u %% NumThreads %u != 0" \ % (totalElementsA, state["NumThreads"])) state["Valid"] = False - return + #return else: state["NumLoadsA"] = totalElementsA / state["NumThreads"] if totalElementsB % state["NumThreads"] != 0: if printReason: print2("totalElementsB %u %% NumThreads %u != 0" \ % (totalElementsB, state["NumThreads"])) state["Valid"] = False - return + #return else: state["NumLoadsB"] = totalElementsB / state["NumThreads"] # how many loads para if state["NumLoadsCoalescedA"] < 1: state["NumLoadsCoalescedA"] = state["NumLoadsA"] - print "Assigning NLCA=%u" % state["NumLoadsA"] if state["NumLoadsA"] % state["NumLoadsCoalescedA"] != 0: if printReason: print2("numLoadsA %u %% numLoadsParaA %u != 0" \ % (state["NumLoadsA"], state["NumLoadsCoalescedA"])) state["Valid"] = False - return + #return else: state["NumLoadsPerpendicularA"] = state["NumLoadsA"] \ / state["NumLoadsCoalescedA"] if state["NumLoadsCoalescedB"] < 1: state["NumLoadsCoalescedB"] = state["NumLoadsB"] - print "Assigning NLCB=%u" % state["NumLoadsB"] if state["NumLoadsB"] % state["NumLoadsCoalescedB"] != 0: if printReason: print2("numLoadsB %u %% numLoadsParaB %u != 0" \ % (state["NumLoadsB"], state["NumLoadsCoalescedB"])) state["Valid"] = False - return + #return else: state["NumLoadsPerpendicularB"] = state["NumLoadsB"] \ / state["NumLoadsCoalescedB"] @@ -628,14 +663,14 @@ def assignDerivedParameters(state): if printReason: print2("totalElementsParaA %u %% numLoadsParaA %u != 0" \ % (totalElementsParaA, state["NumLoadsCoalescedA"])) state["Valid"] = False - return + #return #else: # loadSizeParaA = totalElementsParaA / state["NumLoadsCoalescedA"] if totalElementsPerpA % state["NumLoadsPerpendicularA"] != 0: if printReason: print2("totalElementsPerpA %u %% numLoadsPerpA %u != 0" \ % (totalElementsPerpA, state["NumLoadsPerpendicularA"])) state["Valid"] = False - return + #return #else: # loadSizePerpA = totalElementsPerpA / state["NumLoadsPerpendicularA"] @@ -644,14 +679,14 @@ def assignDerivedParameters(state): if printReason: print2("totalElementsParaB %u %% numLoadsParaB %u != 0" \ % (totalElementsParaB, state["NumLoadsCoalescedB"])) state["Valid"] = False - return + #return #else: # loadSizeParaB = totalElementsParaB / state["NumLoadsCoalescedB"] if totalElementsPerpB % state["NumLoadsPerpendicularB"] != 0: if printReason: print2("totalElementsPerpB %u %% numLoadsPerpB %u != 0" \ % (totalElementsPerpB, state["NumLoadsPerpendicularB"])) state["Valid"] = False - return + #return #else: # loadSizePerpB = totalElementsPerpB / state["NumLoadsPerpendicularB"] @@ -663,7 +698,7 @@ def assignDerivedParameters(state): if sizeLDS > globalParameters["MaxLDS"]: if printReason: print2("Kernel Uses %u > %u bytes" % ( sizeLDS, globalParameters["MaxLDS"])) state["Valid"] = False - return + #return # Compiler may be causing incorrect spills on ROCm1.4 from DT on 2/21/17 if globalParameters["Backend"] == "HIP": @@ -671,14 +706,14 @@ def assignDerivedParameters(state): if state["MacroTile0"] == 128 or state["MacroTile1"] == 128: if state["NumLoadsCoalescedA"] != 1 and state["NumLoadsCoalescedB"] != 8: state["Valid"] = False - return + #return elif state["ProblemType"]["DataType"].value == DataType.double: if globalParameters["Backend"] == "HIP": if state["MacroTile0"] >= 64 or state["MacroTile1"] >= 64: state["Valid"] = False - return + #return + state["AssignedDerivedParameters"] = True - state["Valid"] = True # validation failures # Cijk_Ailk_Bjlk_SB_DU16_LU16_MT064_MT164_NLA16_NLB16_NLCA02_NLCB01_NLPA08_NLPB16_TT008_TT108_TTE08_WG008_WG108_WGE08 @@ -694,7 +729,6 @@ def assignDerivedParameters(state): - ######################################## # create a dictionary with booleans on whether to include parameter in name @staticmethod @@ -755,8 +789,6 @@ def getNameMin(state, requiredParameters): first = False name += "%s%s" % ( Solution.getParameterNameAbbreviation(key), \ Solution.getParameterValueAbbreviation(state[key]) ) - #else: - # print "%s not in %s" % (key, requiredParameters) return name ######################################## @@ -765,26 +797,20 @@ def getNameMin(state, requiredParameters): def getSerialNaming(objs): data = {} for objIdx in range(0, len(objs)): - #print "ObjIdx: %u" % objIdx obj = objs[objIdx] for paramName in sorted(obj.keys()): if paramName not in derrivedParameters: paramValue = obj[paramName] - #if paramName == "ThreadTileEdge": - # print "%s = %s" % (paramName, paramValue) if paramName in data: if paramValue not in data[paramName]: data[paramName].append(paramValue) else: data[paramName] = [ paramValue ] maxObjs = 1 - #print "SerialNaming:" for paramName in data: data[paramName] = sorted(data[paramName]) - #print "%s: %s" % (paramName, data[paramName]) maxObjs *= len(data[paramName]) numDigits = len(str(maxObjs)) - #print "MaxSerialNames: %u (%u)" % (maxObjs, numDigits) return [ data, numDigits ] ######################################## @@ -803,23 +829,10 @@ def getNameSerial(state, serialNaming): paramNameMultiplier = len(paramData) if paramValue in paramData: paramValueIdx = paramData.index(paramValue) - #else: - #print "ERROR %s: %s not in %s" % ( paramName, paramValue, paramData ) - #print state - #printExit() - #if paramNameMultiplier > 1: - #print "serial = %u*%u + %u; multiplier = %u * %u; %s::%s in %s" % ( \ - # paramValueIdx, multiplier, serial, \ - # paramNameMultiplier, multiplier, \ - # paramName, paramValue, paramData[1] ) - serial += paramValueIdx * multiplier multiplier *= paramNameMultiplier - #if serial == 0: - # print state name = "%s%0*u" % ("S" if isinstance(state, Solution) else "K", \ numDigits, serial) - #print "SerialName: %s" % name return name diff --git a/Tensile/__init__.py b/Tensile/__init__.py index 58039f505..4eabd0b3f 100644 --- a/Tensile/__init__.py +++ b/Tensile/__init__.py @@ -1 +1 @@ -__version__ = "2.1.1" +__version__ = "2.1.2" From 443895ae7a02b0aa751cabda9e9f934707543518 Mon Sep 17 00:00:00 2001 From: David Tanner Date: Fri, 24 Feb 2017 12:48:15 -0600 Subject: [PATCH 04/21] fixed removing derived parameters from naming --- Tensile/Common.py | 5 +++-- Tensile/SolutionStructs.py | 38 +++++++++++++++++++++----------------- Tensile/__init__.py | 2 +- 3 files changed, 25 insertions(+), 20 deletions(-) diff --git a/Tensile/Common.py b/Tensile/Common.py index 8576813b2..254985c7a 100644 --- a/Tensile/Common.py +++ b/Tensile/Common.py @@ -96,8 +96,8 @@ {"VectorWidthLocalStore": [ 4 ] }, ] -# derrived parameters may show up in solution dict but don't use for naming -derrivedParameters = [ +# derived parameters may show up in solution dict but don't use for naming +derivedParameters = [ "MacroTile0", "MacroTile1", "WorkGroup0", @@ -108,6 +108,7 @@ "NumLoadsB", "NumLoadsPerpendicularA", "NumLoadsPerpendicularB", + "NumThreads", ] # dictionary of defaults comprised for 1st option for each parameter diff --git a/Tensile/SolutionStructs.py b/Tensile/SolutionStructs.py index 5f4eff66e..2f6b9d407 100644 --- a/Tensile/SolutionStructs.py +++ b/Tensile/SolutionStructs.py @@ -20,7 +20,7 @@ ################################################################################ -from Common import globalParameters, defaultProblemType, assignParameterWithDefault, printExit, assignParameterRequired, defaultSolution, derrivedParameters +from Common import globalParameters, defaultProblemType, assignParameterWithDefault, printExit, assignParameterRequired, defaultSolution, derivedParameters from copy import deepcopy ################################################################################ @@ -583,6 +583,7 @@ def assignProblemIndependentDerivedParameters(state): if "SplitU" in state and "LoopUnroll" in state: state["DepthU"] = state["SplitU"] * state["LoopUnroll"] + printReason = False # num threads state["NumThreads"] = state["WorkGroup0"]*state["WorkGroup1"] if state["NumThreads"] > globalParameters["MaxThreads"]: @@ -625,14 +626,15 @@ def assignDerivedParameters(state): if printReason: print2("totalElementsA %u %% NumThreads %u != 0" \ % (totalElementsA, state["NumThreads"])) state["Valid"] = False - #return + return else: state["NumLoadsA"] = totalElementsA / state["NumThreads"] if totalElementsB % state["NumThreads"] != 0: if printReason: print2("totalElementsB %u %% NumThreads %u != 0" \ % (totalElementsB, state["NumThreads"])) state["Valid"] = False - #return + return + state["NumLoadsB"] = totalElementsB / state["NumThreads"] else: state["NumLoadsB"] = totalElementsB / state["NumThreads"] @@ -643,7 +645,7 @@ def assignDerivedParameters(state): if printReason: print2("numLoadsA %u %% numLoadsParaA %u != 0" \ % (state["NumLoadsA"], state["NumLoadsCoalescedA"])) state["Valid"] = False - #return + return else: state["NumLoadsPerpendicularA"] = state["NumLoadsA"] \ / state["NumLoadsCoalescedA"] @@ -653,7 +655,7 @@ def assignDerivedParameters(state): if printReason: print2("numLoadsB %u %% numLoadsParaB %u != 0" \ % (state["NumLoadsB"], state["NumLoadsCoalescedB"])) state["Valid"] = False - #return + return else: state["NumLoadsPerpendicularB"] = state["NumLoadsB"] \ / state["NumLoadsCoalescedB"] @@ -663,14 +665,14 @@ def assignDerivedParameters(state): if printReason: print2("totalElementsParaA %u %% numLoadsParaA %u != 0" \ % (totalElementsParaA, state["NumLoadsCoalescedA"])) state["Valid"] = False - #return + return #else: # loadSizeParaA = totalElementsParaA / state["NumLoadsCoalescedA"] if totalElementsPerpA % state["NumLoadsPerpendicularA"] != 0: if printReason: print2("totalElementsPerpA %u %% numLoadsPerpA %u != 0" \ % (totalElementsPerpA, state["NumLoadsPerpendicularA"])) state["Valid"] = False - #return + return #else: # loadSizePerpA = totalElementsPerpA / state["NumLoadsPerpendicularA"] @@ -679,14 +681,14 @@ def assignDerivedParameters(state): if printReason: print2("totalElementsParaB %u %% numLoadsParaB %u != 0" \ % (totalElementsParaB, state["NumLoadsCoalescedB"])) state["Valid"] = False - #return + return #else: # loadSizeParaB = totalElementsParaB / state["NumLoadsCoalescedB"] if totalElementsPerpB % state["NumLoadsPerpendicularB"] != 0: if printReason: print2("totalElementsPerpB %u %% numLoadsPerpB %u != 0" \ % (totalElementsPerpB, state["NumLoadsPerpendicularB"])) state["Valid"] = False - #return + return #else: # loadSizePerpB = totalElementsPerpB / state["NumLoadsPerpendicularB"] @@ -698,7 +700,7 @@ def assignDerivedParameters(state): if sizeLDS > globalParameters["MaxLDS"]: if printReason: print2("Kernel Uses %u > %u bytes" % ( sizeLDS, globalParameters["MaxLDS"])) state["Valid"] = False - #return + return # Compiler may be causing incorrect spills on ROCm1.4 from DT on 2/21/17 if globalParameters["Backend"] == "HIP": @@ -745,14 +747,16 @@ def getMinNaming(objs): # only 1, rather than name being nothing, it'll be everything if len(objs) == 1: for key in keys: - requiredParameters[key] = False + if key not in derivedParameters: + requiredParameters[key] = False else: for key in keys: required = False - for i in range(1, len(objs)): - if objs[0][key] != objs[i][key]: - required = True - break + if key not in derivedParameters: + for i in range(1, len(objs)): + if objs[0][key] != objs[i][key]: + required = True + break if required: requiredParameters[key] = True else: @@ -799,7 +803,7 @@ def getSerialNaming(objs): for objIdx in range(0, len(objs)): obj = objs[objIdx] for paramName in sorted(obj.keys()): - if paramName not in derrivedParameters: + if paramName not in derivedParameters: paramValue = obj[paramName] if paramName in data: if paramValue not in data[paramName]: @@ -823,7 +827,7 @@ def getNameSerial(state, serialNaming): serial = 0 multiplier = 1 for paramName in sorted(state.keys()): - if paramName not in derrivedParameters: + if paramName not in derivedParameters: paramValue = state[paramName] paramData = data[paramName] paramNameMultiplier = len(paramData) diff --git a/Tensile/__init__.py b/Tensile/__init__.py index 4eabd0b3f..e835b9d02 100644 --- a/Tensile/__init__.py +++ b/Tensile/__init__.py @@ -1 +1 @@ -__version__ = "2.1.2" +__version__ = "2.1.3" From 2e6e67d980c1e4c96fa3ef2642bbb44fd1dc9a0b Mon Sep 17 00:00:00 2001 From: David Tanner Date: Fri, 24 Feb 2017 14:14:28 -0600 Subject: [PATCH 05/21] nlca -1, 1 search for next valid parameter, set as default --- Tensile/BenchmarkProblems.py | 9 +- Tensile/Common.py | 8 +- Tensile/Configs/rocblas_cgemm.yaml | 16 +-- Tensile/Configs/rocblas_dgemm.yaml | 16 +-- Tensile/Configs/rocblas_sgemm.yaml | 16 +-- Tensile/Configs/rocblas_zgemm.yaml | 16 +-- Tensile/Configs/sgemm.yaml | 5 +- Tensile/Configs/sgemm_5760.yaml | 8 -- Tensile/Configs/tensor_contraction.yaml | 4 +- Tensile/SolutionStructs.py | 165 +++++++++++++++++++----- Tensile/__init__.py | 2 +- 11 files changed, 177 insertions(+), 88 deletions(-) diff --git a/Tensile/BenchmarkProblems.py b/Tensile/BenchmarkProblems.py index cd0bbc575..178acf213 100644 --- a/Tensile/BenchmarkProblems.py +++ b/Tensile/BenchmarkProblems.py @@ -175,10 +175,13 @@ def benchmarkProblemType( config ): for hardcodedSolution in hardcodedSolutions: if hardcodedSolution == solutionObject: hasSolution = True - if not hasSolution: + if hasSolution: + if globalParameters["PrintLevel"] >= 1: + sys.stdout.write(":") + else: solutions[hardcodedIdx].append(solutionObject) - if globalParameters["PrintLevel"] >= 1: - sys.stdout.write("|") + if globalParameters["PrintLevel"] >= 1: + sys.stdout.write("|") else: if globalParameters["PrintLevel"] >= 1: sys.stdout.write(".") diff --git a/Tensile/Common.py b/Tensile/Common.py index 254985c7a..2cf0e0906 100644 --- a/Tensile/Common.py +++ b/Tensile/Common.py @@ -67,6 +67,8 @@ {"EdgeType": [ "Branch" ] }, # Shift {"EdgeMultiKernel": [ False ] }, {"PadLDS": [ 1 ] }, + {"SplitU": [ 1 ] }, + {"Prefetch": [ False ] }, ] # benchmark these solution independently defaultForkParameters = [ @@ -74,8 +76,8 @@ {"WorkGroupShape": [ 0 ] }, # -1, 0, 1 {"ThreadTileEdge": [ 1, 2, 4, 6, 8 ] }, {"ThreadTileShape": [ 0 ] }, # -1, 0, 1 - {"SplitU": [ 1 ] }, - {"Prefetch": [ False ] }, + {"NumLoadsCoalescedA": [ 1, -1 ] }, + {"NumLoadsCoalescedB": [ 1, -1 ] }, ] # keep one winner per solution and it affects which will win defaultBenchmarkForkParameters = [ @@ -88,8 +90,6 @@ ] # keep one winner per solution and it would affect which solutions fastest defaultBenchmarkJoinParameters = [ - {"NumLoadsCoalescedA": [ 1, 2, 3, 4, 6, 8 ] }, - {"NumLoadsCoalescedB": [ 1, 2, 3, 4, 6, 8 ] }, {"VectorWidthGlobalLoad": [ 4 ] }, {"VectorWidthGlobalStore": [ 4 ] }, {"VectorWidthLocalLoad": [ 4 ] }, diff --git a/Tensile/Configs/rocblas_cgemm.yaml b/Tensile/Configs/rocblas_cgemm.yaml index b1c5e2257..3c0fbd958 100644 --- a/Tensile/Configs/rocblas_cgemm.yaml +++ b/Tensile/Configs/rocblas_cgemm.yaml @@ -41,8 +41,8 @@ BenchmarkProblems: - ThreadTileShape: [ 0 ] - WorkGroupEdge: [8, 16] - ThreadTileEdge: [2, 4, 6, 8] - - NumLoadsCoalescedA: [ 1, 2, 3, 4, 6, 8 ] - - NumLoadsCoalescedB: [ 1, 2, 3, 4, 6, 8 ] + - NumLoadsCoalescedA: [ 1, -1 ] + - NumLoadsCoalescedB: [ 1, -1 ] - LoopUnroll: [8, 16] BenchmarkForkParameters: JoinParameters: @@ -74,8 +74,8 @@ BenchmarkProblems: - WorkGroupShape: [ 0 ] - ThreadTileEdge: [2, 4, 6, 8] - ThreadTileShape: [ 0 ] - - NumLoadsCoalescedA: [ 1, 2, 3, 4, 6, 8 ] - - NumLoadsCoalescedB: [ 1, 2, 3, 4, 6, 8 ] + - NumLoadsCoalescedA: [ 1, -1 ] + - NumLoadsCoalescedB: [ 1, -1 ] - LoopUnroll: [8, 16] BenchmarkForkParameters: JoinParameters: @@ -107,8 +107,8 @@ BenchmarkProblems: - WorkGroupShape: [ 0 ] - ThreadTileEdge: [2, 4, 6, 8] - ThreadTileShape: [ 0 ] - - NumLoadsCoalescedA: [ 1, 2, 3, 4, 6, 8 ] - - NumLoadsCoalescedB: [ 1, 2, 3, 4, 6, 8 ] + - NumLoadsCoalescedA: [ 1, -1 ] + - NumLoadsCoalescedB: [ 1, -1 ] - LoopUnroll: [8, 16] BenchmarkForkParameters: JoinParameters: @@ -140,8 +140,8 @@ BenchmarkProblems: - WorkGroupShape: [ 0 ] - ThreadTileEdge: [2, 4, 6, 8] - ThreadTileShape: [ 0 ] - - NumLoadsCoalescedA: [ 1, 2, 3, 4, 6, 8 ] - - NumLoadsCoalescedB: [ 1, 2, 3, 4, 6, 8 ] + - NumLoadsCoalescedA: [ 1, -1 ] + - NumLoadsCoalescedB: [ 1, -1 ] - LoopUnroll: [8, 16] BenchmarkForkParameters: JoinParameters: diff --git a/Tensile/Configs/rocblas_dgemm.yaml b/Tensile/Configs/rocblas_dgemm.yaml index 36e358135..f4bc58257 100644 --- a/Tensile/Configs/rocblas_dgemm.yaml +++ b/Tensile/Configs/rocblas_dgemm.yaml @@ -41,8 +41,8 @@ BenchmarkProblems: - ThreadTileShape: [ 0 ] - WorkGroupEdge: [8, 16] - ThreadTileEdge: [2, 4, 6, 8] - - NumLoadsCoalescedA: [ 1, 2, 3, 4, 6, 8 ] - - NumLoadsCoalescedB: [ 1, 2, 3, 4, 6, 8 ] + - NumLoadsCoalescedA: [ 1, -1 ] + - NumLoadsCoalescedB: [ 1, -1 ] - LoopUnroll: [8, 16] BenchmarkForkParameters: JoinParameters: @@ -74,8 +74,8 @@ BenchmarkProblems: - WorkGroupShape: [ 0 ] - ThreadTileEdge: [2, 4, 6, 8] - ThreadTileShape: [ 0 ] - - NumLoadsCoalescedA: [ 1, 2, 3, 4, 6, 8 ] - - NumLoadsCoalescedB: [ 1, 2, 3, 4, 6, 8 ] + - NumLoadsCoalescedA: [ 1, -1 ] + - NumLoadsCoalescedB: [ 1, -1 ] - LoopUnroll: [8, 16] BenchmarkForkParameters: JoinParameters: @@ -107,8 +107,8 @@ BenchmarkProblems: - WorkGroupShape: [ 0 ] - ThreadTileEdge: [2, 4, 6, 8] - ThreadTileShape: [ 0, 1 ] - - NumLoadsCoalescedA: [ 1, 2, 3, 4, 6, 8 ] - - NumLoadsCoalescedB: [ 1, 2, 3, 4, 6, 8 ] + - NumLoadsCoalescedA: [ 1, -1 ] + - NumLoadsCoalescedB: [ 1, -1 ] - LoopUnroll: [8, 16] BenchmarkForkParameters: JoinParameters: @@ -140,8 +140,8 @@ BenchmarkProblems: - WorkGroupShape: [ 0 ] - ThreadTileEdge: [2, 4, 6, 8] - ThreadTileShape: [ 0 ] - - NumLoadsCoalescedA: [ 1, 2, 3, 4, 6, 8 ] - - NumLoadsCoalescedB: [ 1, 2, 3, 4, 6, 8 ] + - NumLoadsCoalescedA: [ 1, -1 ] + - NumLoadsCoalescedB: [ 1, -1 ] - LoopUnroll: [8, 16] BenchmarkForkParameters: JoinParameters: diff --git a/Tensile/Configs/rocblas_sgemm.yaml b/Tensile/Configs/rocblas_sgemm.yaml index 346bf1cb7..fedec0deb 100644 --- a/Tensile/Configs/rocblas_sgemm.yaml +++ b/Tensile/Configs/rocblas_sgemm.yaml @@ -41,8 +41,8 @@ BenchmarkProblems: - ThreadTileShape: [ 0 ] - WorkGroupEdge: [8, 16] - ThreadTileEdge: [2, 4, 6, 8] - - NumLoadsCoalescedA: [ 1, 2, 3, 4, 6, 8 ] - - NumLoadsCoalescedB: [ 1, 2, 3, 4, 6, 8 ] + - NumLoadsCoalescedA: [ 1, -1 ] + - NumLoadsCoalescedB: [ 1, -1 ] - LoopUnroll: [8, 16] BenchmarkForkParameters: JoinParameters: @@ -74,8 +74,8 @@ BenchmarkProblems: - WorkGroupShape: [ 0 ] - ThreadTileEdge: [2, 4, 6, 8] - ThreadTileShape: [ 0 ] - - NumLoadsCoalescedA: [ 1, 2, 3, 4, 6, 8 ] - - NumLoadsCoalescedB: [ 1, 2, 3, 4, 6, 8 ] + - NumLoadsCoalescedA: [ 1, -1 ] + - NumLoadsCoalescedB: [ 1, -1 ] - LoopUnroll: [8, 16] BenchmarkForkParameters: JoinParameters: @@ -107,8 +107,8 @@ BenchmarkProblems: - WorkGroupShape: [ 0 ] - ThreadTileEdge: [2, 4, 6, 8] - ThreadTileShape: [ 0, 1 ] - - NumLoadsCoalescedA: [ 1, 2, 3, 4, 6, 8 ] - - NumLoadsCoalescedB: [ 1, 2, 3, 4, 6, 8 ] + - NumLoadsCoalescedA: [ 1, -1 ] + - NumLoadsCoalescedB: [ 1, -1 ] - LoopUnroll: [8, 16] BenchmarkForkParameters: JoinParameters: @@ -140,8 +140,8 @@ BenchmarkProblems: - WorkGroupShape: [ 0 ] - ThreadTileEdge: [2, 4, 6, 8] - ThreadTileShape: [ 0 ] - - NumLoadsCoalescedA: [ 1, 2, 3, 4, 6, 8 ] - - NumLoadsCoalescedB: [ 1, 2, 3, 4, 6, 8 ] + - NumLoadsCoalescedA: [ 1, -1 ] + - NumLoadsCoalescedB: [ 1, -1 ] - LoopUnroll: [8, 16] BenchmarkForkParameters: JoinParameters: diff --git a/Tensile/Configs/rocblas_zgemm.yaml b/Tensile/Configs/rocblas_zgemm.yaml index badbd3dfb..64f0f6af9 100644 --- a/Tensile/Configs/rocblas_zgemm.yaml +++ b/Tensile/Configs/rocblas_zgemm.yaml @@ -41,8 +41,8 @@ BenchmarkProblems: - ThreadTileShape: [ 0 ] - WorkGroupEdge: [8, 16] - ThreadTileEdge: [2, 4, 6, 8] - - NumLoadsCoalescedA: [ 1, 2, 3, 4, 6, 8 ] - - NumLoadsCoalescedB: [ 1, 2, 3, 4, 6, 8 ] + - NumLoadsCoalescedA: [ 1, -1 ] + - NumLoadsCoalescedB: [ 1, -1 ] - LoopUnroll: [8, 16] BenchmarkForkParameters: JoinParameters: @@ -74,8 +74,8 @@ BenchmarkProblems: - WorkGroupShape: [ 0 ] - ThreadTileEdge: [2, 4, 6, 8] - ThreadTileShape: [ 0 ] - - NumLoadsCoalescedA: [ 1, 2, 3, 4, 6, 8 ] - - NumLoadsCoalescedB: [ 1, 2, 3, 4, 6, 8 ] + - NumLoadsCoalescedA: [ 1, -1 ] + - NumLoadsCoalescedB: [ 1, -1 ] - LoopUnroll: [8, 16] BenchmarkForkParameters: JoinParameters: @@ -107,8 +107,8 @@ BenchmarkProblems: - WorkGroupShape: [ 0 ] - ThreadTileEdge: [2, 4, 6, 8] - ThreadTileShape: [ 0 ] - - NumLoadsCoalescedA: [ 1, 2, 3, 4, 6, 8 ] - - NumLoadsCoalescedB: [ 1, 2, 3, 4, 6, 8 ] + - NumLoadsCoalescedA: [ 1, -1 ] + - NumLoadsCoalescedB: [ 1, -1 ] - LoopUnroll: [8, 16] BenchmarkForkParameters: JoinParameters: @@ -140,8 +140,8 @@ BenchmarkProblems: - WorkGroupShape: [ 0 ] - ThreadTileEdge: [2, 4, 6, 8] - ThreadTileShape: [ 0 ] - - NumLoadsCoalescedA: [ 1, 2, 3, 4, 6, 8 ] - - NumLoadsCoalescedB: [ 1, 2, 3, 4, 6, 8 ] + - NumLoadsCoalescedA: [ 1, -1 ] + - NumLoadsCoalescedB: [ 1, -1 ] - LoopUnroll: [8, 16] BenchmarkForkParameters: JoinParameters: diff --git a/Tensile/Configs/sgemm.yaml b/Tensile/Configs/sgemm.yaml index 49e6d13c7..7d2805e4a 100644 --- a/Tensile/Configs/sgemm.yaml +++ b/Tensile/Configs/sgemm.yaml @@ -40,13 +40,10 @@ BenchmarkProblems: ForkParameters: - WorkGroupEdge: [8, 16] - ThreadTileEdge: [2, 4, 8 ] - BenchmarkForkParameters: - - NumLoadsCoalescedA: [ 1, 2, 3, 4, 6 ] - - NumLoadsCoalescedB: [ 1, 2, 3, 4, 6 ] - LoopUnroll: [8, 16] + BenchmarkForkParameters: JoinParameters: - MacroTile - - DepthU BenchmarkJoinParameters: BenchmarkFinalParameters: - ProblemSizes: [ [16, 128], [16, 128], [256] ] diff --git a/Tensile/Configs/sgemm_5760.yaml b/Tensile/Configs/sgemm_5760.yaml index d8f077b85..7c167a650 100644 --- a/Tensile/Configs/sgemm_5760.yaml +++ b/Tensile/Configs/sgemm_5760.yaml @@ -39,8 +39,6 @@ BenchmarkProblems: - ThreadTileShape: [0, 1] - WorkGroupEdge: [8, 16] - ThreadTileEdge: [ 8 ] - - NumLoadsCoalescedA: [ 1, 2, 3, 4, 6, 8 ] - - NumLoadsCoalescedB: [ 1, 2, 3, 4, 6, 8 ] - LoopUnroll: [8, 16] #BenchmarkForkParameters: JoinParameters: @@ -71,8 +69,6 @@ BenchmarkProblems: - ThreadTileShape: [0, 1] - WorkGroupEdge: [8, 16] - ThreadTileEdge: [4, 6, 8] - - NumLoadsCoalescedA: [ 1, 2, 3, 4, 5, 6 ] - - NumLoadsCoalescedB: [ 1, 2, 3, 4, 5, 6 ] - LoopUnroll: [8, 16] #BenchmarkForkParameters: JoinParameters: @@ -103,8 +99,6 @@ BenchmarkProblems: - ThreadTileShape: [0, 1] - WorkGroupEdge: [8, 16] - ThreadTileEdge: [4, 6, 8] - - NumLoadsCoalescedA: [ 1, 2, 3, 4, 5, 6 ] - - NumLoadsCoalescedB: [ 1, 2, 3, 4, 5, 6 ] - LoopUnroll: [8, 16] #BenchmarkForkParameters: JoinParameters: @@ -135,8 +129,6 @@ BenchmarkProblems: - ThreadTileShape: [0, 1] - WorkGroupEdge: [8, 16] - ThreadTileEdge: [4, 6, 8] - - NumLoadsCoalescedA: [ 1, 2, 3, 4, 5, 6 ] - - NumLoadsCoalescedB: [ 1, 2, 3, 4, 5, 6 ] - LoopUnroll: [8, 16] #BenchmarkForkParameters: JoinParameters: diff --git a/Tensile/Configs/tensor_contraction.yaml b/Tensile/Configs/tensor_contraction.yaml index dd44124d7..8201d0bec 100644 --- a/Tensile/Configs/tensor_contraction.yaml +++ b/Tensile/Configs/tensor_contraction.yaml @@ -40,10 +40,8 @@ BenchmarkProblems: ForkParameters: - WorkGroupEdge: [8, 16] - ThreadTileEdge: [2, 4, 8 ] - BenchmarkForkParameters: - - NumLoadsCoalescedA: [ 1, 2, 3, 4, 6 ] - - NumLoadsCoalescedB: [ 1, 2, 3, 4, 6 ] - LoopUnroll: [8, 16] + BenchmarkForkParameters: JoinParameters: - MacroTile - DepthU diff --git a/Tensile/SolutionStructs.py b/Tensile/SolutionStructs.py index 2f6b9d407..c83d28fed 100644 --- a/Tensile/SolutionStructs.py +++ b/Tensile/SolutionStructs.py @@ -606,20 +606,20 @@ def assignDerivedParameters(state): # how many elements to load if state["ProblemType"]["TLUA"]: - totalElementsParaA = state["MacroTile0"] + totalElementsCoalescedA = state["MacroTile0"] totalElementsPerpA = state["LoopUnroll"] else: - totalElementsParaA = state["LoopUnroll"] + totalElementsCoalescedA = state["LoopUnroll"] totalElementsPerpA = state["MacroTile0"] if state["ProblemType"]["TLUB"]: - totalElementsParaB = state["MacroTile1"] + totalElementsCoalescedB = state["MacroTile1"] totalElementsPerpB = state["LoopUnroll"] else: - totalElementsParaB = state["LoopUnroll"] + totalElementsCoalescedB = state["LoopUnroll"] totalElementsPerpB = state["MacroTile1"] - totalElementsA = totalElementsParaA * totalElementsPerpA - totalElementsB = totalElementsParaB * totalElementsPerpB + totalElementsA = totalElementsCoalescedA * totalElementsPerpA + totalElementsB = totalElementsCoalescedB * totalElementsPerpB # how many load instructions if totalElementsA % state["NumThreads"] != 0: @@ -638,17 +638,122 @@ def assignDerivedParameters(state): else: state["NumLoadsB"] = totalElementsB / state["NumThreads"] - # how many loads para - if state["NumLoadsCoalescedA"] < 1: - state["NumLoadsCoalescedA"] = state["NumLoadsA"] - if state["NumLoadsA"] % state["NumLoadsCoalescedA"] != 0: - if printReason: print2("numLoadsA %u %% numLoadsParaA %u != 0" \ - % (state["NumLoadsA"], state["NumLoadsCoalescedA"])) - state["Valid"] = False - return + # nlca = 1 + if state["NumLoadsCoalescedA"] == 1: + foundValid = False + for nlca in range(1, state["NumLoadsA"]+1): + nlpa = state["NumLoadsA"] / nlca + if state["NumLoadsA"] % nlca == 0 \ + and totalElementsCoalescedA % nlca == 0 \ + and totalElementsPerpA % nlpa == 0: + state["NumLoadsCoalescedA"] = nlca + state["NumLoadsPerpendicularA"] = nlpa + foundValid = True + break + if not foundValid: + state["Valid"] = False + return + + # nlca = -1 + elif state["NumLoadsCoalescedA"] == -1: + foundValid = False + for nlca in range(state["NumLoadsA"], 0, -1): + nlpa = state["NumLoadsA"] / nlca + if state["NumLoadsA"] % nlca == 0 \ + and totalElementsCoalescedA % nlca == 0 \ + and totalElementsPerpA % nlpa == 0: + state["NumLoadsCoalescedA"] = nlca + state["NumLoadsPerpendicularA"] = nlpa + foundValid = True + break + if not foundValid: + state["Valid"] = False + return + + # nlca = other else: state["NumLoadsPerpendicularA"] = state["NumLoadsA"] \ / state["NumLoadsCoalescedA"] + + if state["NumLoadsA"] % state["NumLoadsCoalescedA"] != 0: + if printReason: print2("numLoadsA %u %% numLoadsParaA %u != 0" \ + % (state["NumLoadsA"], state["NumLoadsCoalescedA"])) + state["Valid"] = False + if totalElementsCoalescedA % state["NumLoadsCoalescedA"] != 0: + if printReason: print2("totalElementsCoalescedA %u %% numLoadsParaA %u != 0" \ + % (totalElementsCoalescedA, state["NumLoadsCoalescedA"])) + state["Valid"] = False + return + if totalElementsPerpA % state["NumLoadsPerpendicularA"] != 0: + if printReason: print2("totalElementsPerpA %u %% numLoadsPerpA %u != 0" \ + % (totalElementsPerpA, state["NumLoadsPerpendicularA"])) + state["Valid"] = False + return + + + + + + # nlcb = 1 + if state["NumLoadsCoalescedB"] == 1: + foundValid = False + for nlca in range(1, state["NumLoadsB"]+1): + nlpa = state["NumLoadsB"] / nlca + if state["NumLoadsB"] % nlca == 0 \ + and totalElementsCoalescedB % nlca == 0 \ + and totalElementsPerpB % nlpa == 0: + state["NumLoadsCoalescedB"] = nlca + state["NumLoadsPerpendicularB"] = nlpa + foundValid = True + break + if not foundValid: + state["Valid"] = False + return + + # nlcb = -1 + elif state["NumLoadsCoalescedB"] == -1: + foundValid = False + for nlca in range(state["NumLoadsB"], 0, -1): + nlpa = state["NumLoadsB"] / nlca + if state["NumLoadsB"] % nlca == 0 \ + and totalElementsCoalescedB % nlca == 0 \ + and totalElementsPerpB % nlpa == 0: + state["NumLoadsCoalescedB"] = nlca + state["NumLoadsPerpendicularB"] = nlpa + foundValid = True + break + if not foundValid: + state["Valid"] = False + return + + # nlcb = other + else: + state["NumLoadsPerpendicularB"] = state["NumLoadsB"] \ + / state["NumLoadsCoalescedB"] + + if state["NumLoadsB"] % state["NumLoadsCoalescedB"] != 0: + if printReason: print2("numLoadsB %u %% numLoadsParaB %u != 0" \ + % (state["NumLoadsB"], state["NumLoadsCoalescedB"])) + state["Valid"] = False + if totalElementsCoalescedB % state["NumLoadsCoalescedB"] != 0: + if printReason: print2("totalElementsCoalescedB %u %% numLoadsParaB %u != 0" \ + % (totalElementsCoalescedB, state["NumLoadsCoalescedB"])) + state["Valid"] = False + return + if totalElementsPerpB % state["NumLoadsPerpendicularB"] != 0: + if printReason: print2("totalElementsPerpB %u %% numLoadsPerpB %u != 0" \ + % (totalElementsPerpB, state["NumLoadsPerpendicularB"])) + state["Valid"] = False + return + + + + + + + + + """ if state["NumLoadsCoalescedB"] < 1: state["NumLoadsCoalescedB"] = state["NumLoadsB"] if state["NumLoadsB"] % state["NumLoadsCoalescedB"] != 0: @@ -660,30 +765,15 @@ def assignDerivedParameters(state): state["NumLoadsPerpendicularB"] = state["NumLoadsB"] \ / state["NumLoadsCoalescedB"] - # load size para/perp A - if totalElementsParaA % state["NumLoadsCoalescedA"] != 0: - if printReason: print2("totalElementsParaA %u %% numLoadsParaA %u != 0" \ - % (totalElementsParaA, state["NumLoadsCoalescedA"])) - state["Valid"] = False - return - #else: - # loadSizeParaA = totalElementsParaA / state["NumLoadsCoalescedA"] - if totalElementsPerpA % state["NumLoadsPerpendicularA"] != 0: - if printReason: print2("totalElementsPerpA %u %% numLoadsPerpA %u != 0" \ - % (totalElementsPerpA, state["NumLoadsPerpendicularA"])) - state["Valid"] = False - return - #else: - # loadSizePerpA = totalElementsPerpA / state["NumLoadsPerpendicularA"] # load size para/perp B - if totalElementsParaB % state["NumLoadsCoalescedB"] != 0: - if printReason: print2("totalElementsParaB %u %% numLoadsParaB %u != 0" \ - % (totalElementsParaB, state["NumLoadsCoalescedB"])) + if totalElementsCoalescedB % state["NumLoadsCoalescedB"] != 0: + if printReason: print2("totalElementsCoalescedB %u %% numLoadsParaB %u != 0" \ + % (totalElementsCoalescedB, state["NumLoadsCoalescedB"])) state["Valid"] = False return #else: - # loadSizeParaB = totalElementsParaB / state["NumLoadsCoalescedB"] + # loadSizeParaB = totalElementsCoalescedB / state["NumLoadsCoalescedB"] if totalElementsPerpB % state["NumLoadsPerpendicularB"] != 0: if printReason: print2("totalElementsPerpB %u %% numLoadsPerpB %u != 0" \ % (totalElementsPerpB, state["NumLoadsPerpendicularB"])) @@ -691,6 +781,15 @@ def assignDerivedParameters(state): return #else: # loadSizePerpB = totalElementsPerpB / state["NumLoadsPerpendicularB"] + """ + + + + + + + + # too much LDS sizeLDS = state["LoopUnroll"] \ diff --git a/Tensile/__init__.py b/Tensile/__init__.py index e835b9d02..df4be5e09 100644 --- a/Tensile/__init__.py +++ b/Tensile/__init__.py @@ -1 +1 @@ -__version__ = "2.1.3" +__version__ = "2.1.4" From 250bfb0c4122bc0d38faa27e1ef728fc373c0242 Mon Sep 17 00:00:00 2001 From: David Tanner Date: Fri, 24 Feb 2017 14:30:02 -0600 Subject: [PATCH 06/21] faster addition of MANY solutions --- Tensile/BenchmarkProblems.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/Tensile/BenchmarkProblems.py b/Tensile/BenchmarkProblems.py index 178acf213..62d9867b4 100644 --- a/Tensile/BenchmarkProblems.py +++ b/Tensile/BenchmarkProblems.py @@ -148,6 +148,7 @@ def benchmarkProblemType( config ): # Enumerate Solutions = Hardcoded * Benchmark ############################################################################ sys.stdout.write("# Enumerating Solutions") + solutionSet = set() # avoid duplicates for nlca=-1, 1 for hardcodedIdx in range(0, numHardcoded): solutions.append([]) hardcodedParamDict = benchmarkStep.hardcodedParameters[hardcodedIdx] @@ -170,18 +171,14 @@ def benchmarkProblemType( config ): # TODO check if solution matches problem size for exact tile kernels solutionObject = Solution(solution) if solutionObject["Valid"]: - hasSolution = False - for hardcodedSolutions in solutions: - for hardcodedSolution in hardcodedSolutions: - if hardcodedSolution == solutionObject: - hasSolution = True - if hasSolution: - if globalParameters["PrintLevel"] >= 1: - sys.stdout.write(":") - else: + if solutionObject not in solutionSet: + solutionSet.add(solutionObject) solutions[hardcodedIdx].append(solutionObject) if globalParameters["PrintLevel"] >= 1: sys.stdout.write("|") + else: + if globalParameters["PrintLevel"] >= 1: + sys.stdout.write(":") else: if globalParameters["PrintLevel"] >= 1: sys.stdout.write(".") From 9db107768f1ee532d2e7530c5863c27797944e66 Mon Sep 17 00:00:00 2001 From: David Tanner Date: Fri, 24 Feb 2017 15:15:52 -0600 Subject: [PATCH 07/21] fixed definition of workgroup and threadtile shapes, able to make even skinnier --- Tensile/BenchmarkStructs.py | 8 ++++---- Tensile/SolutionStructs.py | 16 ++++++++-------- Tensile/__init__.py | 2 +- 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/Tensile/BenchmarkStructs.py b/Tensile/BenchmarkStructs.py index b119001ae..236f67eb9 100644 --- a/Tensile/BenchmarkStructs.py +++ b/Tensile/BenchmarkStructs.py @@ -399,13 +399,13 @@ def convertParametersToSteps(self): macroTileDim0 = workGroupEdgeValues[workGroupEdgeIdx]*threadTileEdgeValues[threadTileEdgeIdx] macroTileDim1 = macroTileDim0 if workGroupShapeValues[workGroupShapeIdx] < 0: - macroTileDim1 /= 2 + macroTileDim0 *= abs(workGroupShapeValues[workGroupShapeIdx]) elif workGroupShapeValues[workGroupShapeIdx] > 0: - macroTileDim1 *= 2 + macroTileDim1 *= abs(workGroupShapeValues[workGroupShapeIdx]) if threadTileShapeValues[threadTileShapeIdx] < 0: - macroTileDim1 /= 2 + macroTileDim0 *= abs(threadTileShapeValues[threadTileShapeIdx]) elif threadTileShapeValues[threadTileShapeIdx] > 0: - macroTileDim1 *= 2 + macroTileDim1 *= abs(threadTileShapeValues[threadTileShapeIdx]) if macroTileDim0/macroTileDim1 <= self.initialSolutionParameters["MacroTileMaxRatio"] and macroTileDim1/macroTileDim0 <= self.initialSolutionParameters["MacroTileMaxRatio"]: macroTileJoinSet.add((macroTileDim0, macroTileDim1)) totalPermutations *=len(macroTileJoinSet) diff --git a/Tensile/SolutionStructs.py b/Tensile/SolutionStructs.py index c83d28fed..eeeef16af 100644 --- a/Tensile/SolutionStructs.py +++ b/Tensile/SolutionStructs.py @@ -562,18 +562,18 @@ def assignProblemIndependentDerivedParameters(state): # workgroup sizes state["WorkGroup0"] = state["WorkGroupEdge"] state["WorkGroup1"] = state["WorkGroupEdge"] - if state["WorkGroupShape"] == 1: - state["WorkGroup1"] *= 2 - elif state["WorkGroupShape"] == -1: - state["WorkGroup0"] *= 2 + if state["WorkGroupShape"] > 0: + state["WorkGroup1"] *= abs(state["WorkGroupShape"]) + elif state["WorkGroupShape"] < 0: + state["WorkGroup0"] *= abs(state["WorkGroupShape"]) # thread tile sizes state["ThreadTile0"] = state["ThreadTileEdge"] state["ThreadTile1"] = state["ThreadTileEdge"] - if state["ThreadTileShape"] == 1: - state["ThreadTile1"] *= 2 - elif state["ThreadTileShape"] == -1: - state["ThreadTile0"] *= 2 + if state["ThreadTileShape"] > 0: + state["ThreadTile1"] *= abs(state["ThreadTileShape"]) + elif state["ThreadTileShape"] < 0: + state["ThreadTile0"] *= abs(state["ThreadTileShape"]) # macro tile sizes if "WorkGroup0" in state and "ThreadTile0" in state: diff --git a/Tensile/__init__.py b/Tensile/__init__.py index df4be5e09..0b167e619 100644 --- a/Tensile/__init__.py +++ b/Tensile/__init__.py @@ -1 +1 @@ -__version__ = "2.1.4" +__version__ = "2.1.5" From 83da8dbf7a460c250add4da285d6f67cfa9cd6fc Mon Sep 17 00:00:00 2001 From: David Tanner Date: Fri, 24 Feb 2017 16:11:24 -0600 Subject: [PATCH 08/21] fixed shape definition --- Tensile/BenchmarkStructs.py | 8 ++++---- Tensile/SolutionStructs.py | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/Tensile/BenchmarkStructs.py b/Tensile/BenchmarkStructs.py index 236f67eb9..86742be6c 100644 --- a/Tensile/BenchmarkStructs.py +++ b/Tensile/BenchmarkStructs.py @@ -399,13 +399,13 @@ def convertParametersToSteps(self): macroTileDim0 = workGroupEdgeValues[workGroupEdgeIdx]*threadTileEdgeValues[threadTileEdgeIdx] macroTileDim1 = macroTileDim0 if workGroupShapeValues[workGroupShapeIdx] < 0: - macroTileDim0 *= abs(workGroupShapeValues[workGroupShapeIdx]) + macroTileDim0 *= 2*abs(workGroupShapeValues[workGroupShapeIdx]) elif workGroupShapeValues[workGroupShapeIdx] > 0: - macroTileDim1 *= abs(workGroupShapeValues[workGroupShapeIdx]) + macroTileDim1 *= 2*abs(workGroupShapeValues[workGroupShapeIdx]) if threadTileShapeValues[threadTileShapeIdx] < 0: - macroTileDim0 *= abs(threadTileShapeValues[threadTileShapeIdx]) + macroTileDim0 *= 2*abs(threadTileShapeValues[threadTileShapeIdx]) elif threadTileShapeValues[threadTileShapeIdx] > 0: - macroTileDim1 *= abs(threadTileShapeValues[threadTileShapeIdx]) + macroTileDim1 *= 2*abs(threadTileShapeValues[threadTileShapeIdx]) if macroTileDim0/macroTileDim1 <= self.initialSolutionParameters["MacroTileMaxRatio"] and macroTileDim1/macroTileDim0 <= self.initialSolutionParameters["MacroTileMaxRatio"]: macroTileJoinSet.add((macroTileDim0, macroTileDim1)) totalPermutations *=len(macroTileJoinSet) diff --git a/Tensile/SolutionStructs.py b/Tensile/SolutionStructs.py index eeeef16af..bcaa80acd 100644 --- a/Tensile/SolutionStructs.py +++ b/Tensile/SolutionStructs.py @@ -563,17 +563,17 @@ def assignProblemIndependentDerivedParameters(state): state["WorkGroup0"] = state["WorkGroupEdge"] state["WorkGroup1"] = state["WorkGroupEdge"] if state["WorkGroupShape"] > 0: - state["WorkGroup1"] *= abs(state["WorkGroupShape"]) + state["WorkGroup1"] *= 2*abs(state["WorkGroupShape"]) elif state["WorkGroupShape"] < 0: - state["WorkGroup0"] *= abs(state["WorkGroupShape"]) + state["WorkGroup0"] *= 2*abs(state["WorkGroupShape"]) # thread tile sizes state["ThreadTile0"] = state["ThreadTileEdge"] state["ThreadTile1"] = state["ThreadTileEdge"] if state["ThreadTileShape"] > 0: - state["ThreadTile1"] *= abs(state["ThreadTileShape"]) + state["ThreadTile1"] *= 2*abs(state["ThreadTileShape"]) elif state["ThreadTileShape"] < 0: - state["ThreadTile0"] *= abs(state["ThreadTileShape"]) + state["ThreadTile0"] *= 2*abs(state["ThreadTileShape"]) # macro tile sizes if "WorkGroup0" in state and "ThreadTile0" in state: From e87d18dbc0ffc52b10a1ce94750e29833d1eb0ec Mon Sep 17 00:00:00 2001 From: David Tanner Date: Fri, 24 Feb 2017 16:29:00 -0600 Subject: [PATCH 09/21] min threads --- Tensile/BenchmarkStructs.py | 8 +-- Tensile/Common.py | 5 +- Tensile/SolutionStructs.py | 11 ++-- Tensile/SolutionWriter.py | 124 ------------------------------------ 4 files changed, 14 insertions(+), 134 deletions(-) diff --git a/Tensile/BenchmarkStructs.py b/Tensile/BenchmarkStructs.py index 86742be6c..236f67eb9 100644 --- a/Tensile/BenchmarkStructs.py +++ b/Tensile/BenchmarkStructs.py @@ -399,13 +399,13 @@ def convertParametersToSteps(self): macroTileDim0 = workGroupEdgeValues[workGroupEdgeIdx]*threadTileEdgeValues[threadTileEdgeIdx] macroTileDim1 = macroTileDim0 if workGroupShapeValues[workGroupShapeIdx] < 0: - macroTileDim0 *= 2*abs(workGroupShapeValues[workGroupShapeIdx]) + macroTileDim0 *= abs(workGroupShapeValues[workGroupShapeIdx]) elif workGroupShapeValues[workGroupShapeIdx] > 0: - macroTileDim1 *= 2*abs(workGroupShapeValues[workGroupShapeIdx]) + macroTileDim1 *= abs(workGroupShapeValues[workGroupShapeIdx]) if threadTileShapeValues[threadTileShapeIdx] < 0: - macroTileDim0 *= 2*abs(threadTileShapeValues[threadTileShapeIdx]) + macroTileDim0 *= abs(threadTileShapeValues[threadTileShapeIdx]) elif threadTileShapeValues[threadTileShapeIdx] > 0: - macroTileDim1 *= 2*abs(threadTileShapeValues[threadTileShapeIdx]) + macroTileDim1 *= abs(threadTileShapeValues[threadTileShapeIdx]) if macroTileDim0/macroTileDim1 <= self.initialSolutionParameters["MacroTileMaxRatio"] and macroTileDim1/macroTileDim0 <= self.initialSolutionParameters["MacroTileMaxRatio"]: macroTileJoinSet.add((macroTileDim0, macroTileDim1)) totalPermutations *=len(macroTileJoinSet) diff --git a/Tensile/Common.py b/Tensile/Common.py index 2cf0e0906..08e85b586 100644 --- a/Tensile/Common.py +++ b/Tensile/Common.py @@ -49,6 +49,7 @@ globalParameters["DataInitType"] = 0 # 0=rand, 1=1, 2=serial # protect against invalid kernel globalParameters["MaxThreads"] = 256 +globalParameters["MinThreads"] = 64 globalParameters["MaxRegisters"] = 256 globalParameters["MaxLDS"] = 32768 @@ -73,9 +74,9 @@ # benchmark these solution independently defaultForkParameters = [ {"WorkGroupEdge": [ 16, 8 ] }, - {"WorkGroupShape": [ 0 ] }, # -1, 0, 1 + {"WorkGroupShape": [ 0 ] }, # -4, -2, 0, 2, 4 {"ThreadTileEdge": [ 1, 2, 4, 6, 8 ] }, - {"ThreadTileShape": [ 0 ] }, # -1, 0, 1 + {"ThreadTileShape": [ 0 ] }, # -4, -2, 0, 2, 4 {"NumLoadsCoalescedA": [ 1, -1 ] }, {"NumLoadsCoalescedB": [ 1, -1 ] }, ] diff --git a/Tensile/SolutionStructs.py b/Tensile/SolutionStructs.py index bcaa80acd..e4a853804 100644 --- a/Tensile/SolutionStructs.py +++ b/Tensile/SolutionStructs.py @@ -563,17 +563,17 @@ def assignProblemIndependentDerivedParameters(state): state["WorkGroup0"] = state["WorkGroupEdge"] state["WorkGroup1"] = state["WorkGroupEdge"] if state["WorkGroupShape"] > 0: - state["WorkGroup1"] *= 2*abs(state["WorkGroupShape"]) + state["WorkGroup1"] *= abs(state["WorkGroupShape"]) elif state["WorkGroupShape"] < 0: - state["WorkGroup0"] *= 2*abs(state["WorkGroupShape"]) + state["WorkGroup0"] *= abs(state["WorkGroupShape"]) # thread tile sizes state["ThreadTile0"] = state["ThreadTileEdge"] state["ThreadTile1"] = state["ThreadTileEdge"] if state["ThreadTileShape"] > 0: - state["ThreadTile1"] *= 2*abs(state["ThreadTileShape"]) + state["ThreadTile1"] *= abs(state["ThreadTileShape"]) elif state["ThreadTileShape"] < 0: - state["ThreadTile0"] *= 2*abs(state["ThreadTileShape"]) + state["ThreadTile0"] *= abs(state["ThreadTileShape"]) # macro tile sizes if "WorkGroup0" in state and "ThreadTile0" in state: @@ -589,6 +589,9 @@ def assignProblemIndependentDerivedParameters(state): if state["NumThreads"] > globalParameters["MaxThreads"]: if printReason: print2("rejecting %u threads" % state["NumThreads"]) state["Valid"] = False + if state["NumThreads"] < globalParameters["MinThreads"]: + if printReason: print2("rejecting %u threads" % state["NumThreads"]) + state["Valid"] = False state["AssignedProblemIndependentDerivedParameters"] = True ######################################## diff --git a/Tensile/SolutionWriter.py b/Tensile/SolutionWriter.py index 452c9c45e..ea7585b57 100644 --- a/Tensile/SolutionWriter.py +++ b/Tensile/SolutionWriter.py @@ -787,127 +787,3 @@ def getHeaderFileString(self, solution): return fileStr - ############################################################################## - # are solution parameters (dict) self-consistent - ############################################################################## - @ staticmethod - def solutionParametersConsistent(solution): - printReason = False - - numThreads = solution["WorkGroup0"]*solution["WorkGroup1"] - if numThreads > globalParameters["MaxThreads"]: - if printReason: print2("rejecting %u threads" % numThreads) - return False - - # how many elements to load - if solution["ProblemType"]["TLUA"]: - totalElementsParaA = solution["MacroTile0"] - totalElementsPerpA = solution["LoopUnroll"] - else: - totalElementsParaA = solution["LoopUnroll"] - totalElementsPerpA = solution["MacroTile0"] - - if solution["ProblemType"]["TLUB"]: - totalElementsParaB = solution["MacroTile1"] - totalElementsPerpB = solution["LoopUnroll"] - else: - totalElementsParaB = solution["LoopUnroll"] - totalElementsPerpB = solution["MacroTile1"] - totalElementsA = totalElementsParaA * totalElementsPerpA - totalElementsB = totalElementsParaB * totalElementsPerpB - - # how many load instructions - if totalElementsA % numThreads != 0: - if printReason: print2("totalElementsA %u %% numThreads %u != 0" \ - % (totalElementsA, numThreads)) - return False - else: - solution["NumLoadsA"] = totalElementsA / numThreads - if totalElementsB % numThreads != 0: - if printReason: print2("totalElementsB %u %% numThreads %u != 0" \ - % (totalElementsB, numThreads)) - return False - else: - solution["NumLoadsB"] = totalElementsB / numThreads - - # how many loads para - if solution["NumLoadsCoalescedA"] < 1: - solution["NumLoadsCoalescedA"] = solution["NumLoadsA"] - if solution["NumLoadsA"] % solution["NumLoadsCoalescedA"] != 0: - if printReason: print2("numLoadsA %u %% numLoadsParaA %u != 0" \ - % (solution["NumLoadsA"], solution["NumLoadsCoalescedA"])) - return False - else: - solution["NumLoadsPerpendicularA"] = solution["NumLoadsA"] \ - / solution["NumLoadsCoalescedA"] - if solution["NumLoadsCoalescedB"] < 1: - solution["NumLoadsCoalescedB"] = solution["NumLoadsB"] - if solution["NumLoadsB"] % solution["NumLoadsCoalescedB"] != 0: - if printReason: print2("numLoadsB %u %% numLoadsParaB %u != 0" \ - % (solution["NumLoadsB"], solution["NumLoadsCoalescedB"])) - return False - else: - solution["NumLoadsPerpendicularB"] = solution["NumLoadsB"] \ - / solution["NumLoadsCoalescedB"] - - # load size para/perp A - if totalElementsParaA % solution["NumLoadsCoalescedA"] != 0: - if printReason: print2("totalElementsParaA %u %% numLoadsParaA %u != 0" \ - % (totalElementsParaA, solution["NumLoadsCoalescedA"])) - return False - #else: - # loadSizeParaA = totalElementsParaA / solution["NumLoadsCoalescedA"] - if totalElementsPerpA % solution["NumLoadsPerpendicularA"] != 0: - if printReason: print2("totalElementsPerpA %u %% numLoadsPerpA %u != 0" \ - % (totalElementsPerpA, solution["NumLoadsPerpendicularA"])) - return False - #else: - # loadSizePerpA = totalElementsPerpA / solution["NumLoadsPerpendicularA"] - - # load size para/perp B - if totalElementsParaB % solution["NumLoadsCoalescedB"] != 0: - if printReason: print2("totalElementsParaB %u %% numLoadsParaB %u != 0" \ - % (totalElementsParaB, solution["NumLoadsCoalescedB"])) - return False - #else: - # loadSizeParaB = totalElementsParaB / solution["NumLoadsCoalescedB"] - if totalElementsPerpB % solution["NumLoadsPerpendicularB"] != 0: - if printReason: print2("totalElementsPerpB %u %% numLoadsPerpB %u != 0" \ - % (totalElementsPerpB, solution["NumLoadsPerpendicularB"])) - return False - #else: - # loadSizePerpB = totalElementsPerpB / solution["NumLoadsPerpendicularB"] - - # too much LDS - sizeLDS = solution["LoopUnroll"] \ - * (solution["PadLDS"] * 2 + solution["MacroTile0"] \ - + solution["MacroTile1"] ) \ - * solution["ProblemType"]["DataType"].numBytes() - if sizeLDS > globalParameters["MaxLDS"]: - if printReason: print2("Kernel Uses %u > %u bytes" % ( sizeLDS, globalParameters["MaxLDS"])) - return False - - # Compiler may be causing incorrect spills on ROCm1.4 from DT on 2/21/17 - if globalParameters["Backend"] == "HIP": - if solution["ProblemType"]["DataType"].value == DataType.single: - if solution["MacroTile0"] == 128 or solution["MacroTile1"] == 128: - if solution["NumLoadsCoalescedA"] != 1 and solution["NumLoadsCoalescedB"] != 8: - return False - elif solution["ProblemType"]["DataType"].value == DataType.double: - if globalParameters["Backend"] == "HIP": - if solution["MacroTile0"] >= 64 or solution["MacroTile1"] >= 64: - return False -# validation failures -# Cijk_Ailk_Bjlk_SB_DU16_LU16_MT064_MT164_NLA16_NLB16_NLCA02_NLCB01_NLPA08_NLPB16_TT008_TT108_TTE08_WG008_WG108_WGE08 -# Cijk_Ailk_Bjlk_SB_DU16_LU16_MT064_MT164_NLA16_NLB16_NLCA04_NLCB02_NLPA04_NLPB08_TT008_TT108_TTE08_WG008_WG108_WGE08 -# Cijk_Ailk_Bjlk_SB_DU16_LU16_MT064_MT164_NLA16_NLB16_NLCA02_NLCB04_NLPA08_NLPB04_TT008_TT108_TTE08_WG008_WG108_WGE08 - -# Cijk_Ailk_Bjlk_DB_DU16_LU16_MT064_MT164_NLA16_NLB16_NLCA04_NLCB01_NLPA04_NLPB16_TT008_TT108_TTE08_WG008_WG108_WGE08 -# Cijk_Ailk_Bjlk_DB_DU08_LU08_MT064_MT164_NLA08_NLB08_NLCA01_NLCB01_NLPA08_NLPB08_TT008_TT108_TTE08_WG008_WG108_WGE08 -# Cijk_Ailk_Bjlk_DB_DU08_LU08_MT064_MT164_NLA08_NLB08_NLCA08_NLCB01_NLPA01_NLPB08_TT008_TT108_TTE08_WG008_WG108_WGE08 -# Cijk_Ailk_Bjlk_DB_DU08_LU08_MT064_MT164_NLA08_NLB08_NLCA08_NLCB08_NLPA01_NLPB01_TT008_TT108_TTE08_WG008_WG108_WGE08 -# Cijk_Ailk_Bjlk_DB_DU16_LU16_MT064_MT164_NLA16_NLB16_NLCA08_NLCB08_NLPA02_NLPB02_TT008_TT108_TTE08_WG008_WG108_WGE08 -# Cijk_Ailk_Bjlk_DB_DU08_LU08_MT064_MT164_NLA08_NLB08_NLCA01_NLCB08_NLPA08_NLPB01_TT008_TT108_TTE08_WG008_WG108_WGE08 - - return True - From 851416b3844177aeced026d7a17bb4a6fc526e5c Mon Sep 17 00:00:00 2001 From: David Tanner Date: Fri, 24 Feb 2017 16:52:43 -0600 Subject: [PATCH 10/21] enabling max thread tile to support skinny --- Tensile/BenchmarkStructs.py | 6 ++++-- Tensile/Common.py | 5 +++-- Tensile/SolutionStructs.py | 8 ++++++++ Tensile/__init__.py | 2 +- 4 files changed, 16 insertions(+), 5 deletions(-) diff --git a/Tensile/BenchmarkStructs.py b/Tensile/BenchmarkStructs.py index 236f67eb9..48bbf9813 100644 --- a/Tensile/BenchmarkStructs.py +++ b/Tensile/BenchmarkStructs.py @@ -2,7 +2,7 @@ # from copy import * from copy import copy, deepcopy -from Common import print1, print2, printWarning, defaultSolution, defaultProblemSizes, defaultBenchmarkFinalProblemSizes, defaultBenchmarkCommonParameters, hasParam, defaultBenchmarkJoinParameters, getParamValues, defaultForkParameters, defaultBenchmarkForkParameters, defaultJoinParameters, printExit +from Common import print1, print2, printWarning, defaultSolution, defaultProblemSizes, defaultBenchmarkFinalProblemSizes, defaultBenchmarkCommonParameters, hasParam, defaultBenchmarkJoinParameters, getParamValues, defaultForkParameters, defaultBenchmarkForkParameters, defaultJoinParameters, printExit, globalParameters from SolutionStructs import Solution, ProblemType, ProblemSizes ################################################################################ @@ -406,7 +406,9 @@ def convertParametersToSteps(self): macroTileDim0 *= abs(threadTileShapeValues[threadTileShapeIdx]) elif threadTileShapeValues[threadTileShapeIdx] > 0: macroTileDim1 *= abs(threadTileShapeValues[threadTileShapeIdx]) - if macroTileDim0/macroTileDim1 <= self.initialSolutionParameters["MacroTileMaxRatio"] and macroTileDim1/macroTileDim0 <= self.initialSolutionParameters["MacroTileMaxRatio"]: + # TODO is this still useful? + if macroTileDim0/macroTileDim1 <= globalParameters["MaxMacroTileRatio"] \ + and macroTileDim1/macroTileDim0 <= globalParameters["MaxMacroTileRatio"]: macroTileJoinSet.add((macroTileDim0, macroTileDim1)) totalPermutations *=len(macroTileJoinSet) print2("JoinMacroTileSet(%u): %s" % (len(macroTileJoinSet), macroTileJoinSet) ) diff --git a/Tensile/Common.py b/Tensile/Common.py index 08e85b586..ef2b56533 100644 --- a/Tensile/Common.py +++ b/Tensile/Common.py @@ -50,8 +50,9 @@ # protect against invalid kernel globalParameters["MaxThreads"] = 256 globalParameters["MinThreads"] = 64 -globalParameters["MaxRegisters"] = 256 globalParameters["MaxLDS"] = 32768 +globalParameters["MaxMacroTileRatio"] = 4 +globalParameters["MaxThreadTile"] = 64 ################################################################################ @@ -101,6 +102,7 @@ derivedParameters = [ "MacroTile0", "MacroTile1", + "DepthU", "WorkGroup0", "WorkGroup1", "ThreadTile0", @@ -120,7 +122,6 @@ for key, value in paramDict.iteritems(): defaultSolution[key] = value[0] # other non-benchmark options for solutions -defaultSolution["MacroTileMaxRatio"] = 2 ################################################################################ # Default Problem Type diff --git a/Tensile/SolutionStructs.py b/Tensile/SolutionStructs.py index e4a853804..7eb18fa25 100644 --- a/Tensile/SolutionStructs.py +++ b/Tensile/SolutionStructs.py @@ -592,6 +592,14 @@ def assignProblemIndependentDerivedParameters(state): if state["NumThreads"] < globalParameters["MinThreads"]: if printReason: print2("rejecting %u threads" % state["NumThreads"]) state["Valid"] = False + + if state["MacroTile0"]/state["MacroTile1"] > globalParameters["MaxMacroTileRatio"] \ + or state["MacroTile1"]/state["MacroTile0"] > globalParameters["MaxMacroTileRatio"] \ + or state["ThreadTile0"]*state["ThreadTile1"]*state["ProblemType"]["DataType"].numRegisters() > globalParameters["MaxThreadTile"]: + state["Valid"] = False + + + state["AssignedProblemIndependentDerivedParameters"] = True ######################################## diff --git a/Tensile/__init__.py b/Tensile/__init__.py index 0b167e619..edc60b350 100644 --- a/Tensile/__init__.py +++ b/Tensile/__init__.py @@ -1 +1 @@ -__version__ = "2.1.5" +__version__ = "2.1.6" From 0b7a7b90ff2a852f7f4ef8a2b56a92814b8ae865 Mon Sep 17 00:00:00 2001 From: David Tanner Date: Fri, 24 Feb 2017 17:07:05 -0600 Subject: [PATCH 11/21] moving tile shape to problem-size dependent b/c numRegisters --- Tensile/SolutionStructs.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/Tensile/SolutionStructs.py b/Tensile/SolutionStructs.py index 7eb18fa25..7dc27a644 100644 --- a/Tensile/SolutionStructs.py +++ b/Tensile/SolutionStructs.py @@ -593,15 +593,15 @@ def assignProblemIndependentDerivedParameters(state): if printReason: print2("rejecting %u threads" % state["NumThreads"]) state["Valid"] = False + # tile shape if state["MacroTile0"]/state["MacroTile1"] > globalParameters["MaxMacroTileRatio"] \ - or state["MacroTile1"]/state["MacroTile0"] > globalParameters["MaxMacroTileRatio"] \ - or state["ThreadTile0"]*state["ThreadTile1"]*state["ProblemType"]["DataType"].numRegisters() > globalParameters["MaxThreadTile"]: + or state["MacroTile1"]/state["MacroTile0"] > globalParameters["MaxMacroTileRatio"]: state["Valid"] = False - - + # done state["AssignedProblemIndependentDerivedParameters"] = True + ######################################## # assign all derived parameters @staticmethod @@ -615,6 +615,10 @@ def assignDerivedParameters(state): ProblemType.assignDerivedParameters(state["ProblemType"]) printReason = False + # tile size + if state["ThreadTile0"]*state["ThreadTile1"]*state["ProblemType"]["DataType"].numRegisters() > globalParameters["MaxThreadTile"]: + state["Valid"] = False + # how many elements to load if state["ProblemType"]["TLUA"]: totalElementsCoalescedA = state["MacroTile0"] From c610aefdc8f50280d9a6eec21a60c41786c17e3d Mon Sep 17 00:00:00 2001 From: David Tanner Date: Wed, 1 Mar 2017 11:15:27 -0600 Subject: [PATCH 12/21] revising logic to handle global problems --- Tensile/Common.py | 5 + Tensile/LibraryLogic.py | 1098 +++++++++++++++++++++++++++++++----- Tensile/SolutionStructs.py | 2 + 3 files changed, 954 insertions(+), 151 deletions(-) diff --git a/Tensile/Common.py b/Tensile/Common.py index ef2b56533..557690f79 100644 --- a/Tensile/Common.py +++ b/Tensile/Common.py @@ -154,6 +154,11 @@ defaultAnalysisParameters = { "Dilation": 3, "Threshold": 0.1, + "OutlierThreshold": 0, + "FractionTimeSavedMin": 0.01, # = 1% + "Weight0": 100, + "Weight1": 100, + "Weight2": 100, } diff --git a/Tensile/LibraryLogic.py b/Tensile/LibraryLogic.py index 881631f2b..72b8d60cb 100644 --- a/Tensile/LibraryLogic.py +++ b/Tensile/LibraryLogic.py @@ -12,7 +12,7 @@ ################################################################################ # Analyze Problem Type ################################################################################ -def analyzeProblemType( problemTypeTuple, analysisParameters ): +def analyzeProblemType( problemTypeTuple, inputParameters ): problemType = problemTypeTuple[0] problemSizes = problemTypeTuple[1] dataFileName = problemTypeTuple[2] @@ -23,6 +23,7 @@ def analyzeProblemType( problemTypeTuple, analysisParameters ): #print "# %s" % dataFileName #print "# %s" % solutionsFileName + ###################################### # Read Solutions (problemSizes, solutions) = YAMLIO.readSolutions(solutionsFileName) print2("# ProblemSizes: %s" % problemSizes) @@ -30,130 +31,45 @@ def analyzeProblemType( problemTypeTuple, analysisParameters ): print2("# Solutions:") solutionIdx = 0 for solution in solutions: - print2("# (%u) %s" % (solutionIdx, Solution.getNameMin(solution, solutionMinNaming))) + print2("# (%u) %s" % (solutionIdx, Solution.getNameMin(solution, \ + solutionMinNaming))) solutionIdx += 1 print2(HR) + ###################################### # Read Data From CSV - #numProblemSizes = problemSizes.numProblemSizes - data = BenchmarkDataAnalyzer(problemType, problemSizes, solutions, \ - analysisParameters) - data.populateFromCSV(dataFileName) - - ############################################################################## - # Determine Solutions Along Diagonal - # roughly same splitting regardless of sizeU - problemIndices = [] - for numProblemsForIndex in data.numProblemSizes: - problemIndices.append(numProblemsForIndex-1) - diagonalRules = data.getFastestSolutionsAlongDiagonal(problemIndices) - if True: - print2("Diagonal Rules:") - for rule in diagonalRules: - string = " if freeSize >=%4u" % data.problemIndexToSize[0][rule[1][0]] - for i in range(1, data.numIndices): - string += "x%4u" % data.problemIndexToSize[i][rule[1][i]] - string += " return S[%u] @ %5.0f-%5.0f>%5.0f GFlops is %s" \ - % (rule[0], rule[2], rule[3], rule[4], \ - data.solutionNames[rule[0]]) - print2(string) - - ############################################################################## - # Determine Skinny0 Solutions - skinnyRules01 = data.getSkinnySolutions(diagonalRules, problemIndices, \ - data.idx0, data.idx1) - #print "Skinny Rules:" - #for rule in skinnyRules01: - # string = " if freeSize >=%4u" % data.problemIndexToSize[0][rule[1][0]] - # for i in range(1, data.numIndices): - # string += "x%4u" % data.problemIndexToSize[i][rule[1][i]] - # string += " return S[%u] @ %5.0f-%5.0f>%5.0f GFlops is %s" \ - # % (rule[0], rule[2], rule[3], rule[4], \ - # data.solutionNames[rule[0]]) - - ############################################################################## - # Determine Skinny1 Solutions - skinnyRules10 = data.getSkinnySolutions(diagonalRules, problemIndices, \ - data.idx1, data.idx0) - - # list solutions that actually get used - solutionIndicesUsed = [] - for rule in skinnyRules01: - pass - for rule in skinnyRules10: - pass - for rule in diagonalRules: - solutionIdx = rule[0] - solution = solutions[solutionIdx] - MT0 = solution["MacroTile0"] - MT1 = solution["MacroTile1"] - DU = solution["DepthU"] - #print "Rule Tile S[%u]: %ux%ux%u" % (solutionIdx, MT0, MT1, DU) - # is this solution in the list - inList = False - for solutionUsed in solutionIndicesUsed: - if solutionUsed[0] == solutionIdx: - inList = True - break - if not inList: - insertIdx = len(solutionIndicesUsed) - for i in range(0, len(solutionIndicesUsed)): - iMT0 = solutionIndicesUsed[i][1] - iMT1 = solutionIndicesUsed[i][2] - iDU = solutionIndicesUsed[i][3] - #print " compare S[%u]: %ux%ux%u" % (solutionIndicesUsed[i][0], \ - # iMT0, iMT1, iDU) - if MT0*MT1 < iMT0*iMT1: - insertIdx = i - break - elif MT0*MT1 > iMT0*iMT1: - continue - else: # MT == MT - if DU < iDU: - insertIdx = i - break - else: - continue + logic = LogicAnalyzer(problemType, problemSizes, solutions, inputParameters) + logic.populateFromCSV(dataFileName) - # if i'm smaller than i, insert me before i - #print "insert: %u" % insertIdx - solutionIndicesUsed.insert(insertIdx, [solutionIdx, MT0, MT1, DU]) - #print solutionIndicesUsed + ###################################### + # Remove invalid solutions + logic.removeInvalidSolutions() - # list of solutions used - solutionsUsed = [] - for solutionIndexUsed in solutionIndicesUsed: - solutionsUsed.append(solutions[solutionIndexUsed[0]]) + ###################################### + # Remove least important solutions + logic.removeLeastImportantSolutions() - # translate rules to new solution indices - for rule in skinnyRules01: - pass - for rule in skinnyRules10: - pass - for ruleIdx in range(0, len(diagonalRules)): - solutionIdx = diagonalRules[ruleIdx][0] - for i in range(0, len(solutionIndicesUsed)): - solutionIndexUsed = solutionIndicesUsed[i] - if solutionIdx == solutionIndexUsed[0]: - diagonalRules[ruleIdx][0] = i - break - # change problemSizeIndices to sizes - for i in range(0, 3): - diagonalRules[ruleIdx][1][i] = \ - data.problemIndexToSize[i][ diagonalRules[ruleIdx][1][i] ] + ###################################### + # Correct outliers + # logic.smooth() + logic.print2D([0, 0]) + + ###################################### + # Create Rules + logic.enRule(0, logic.globalIndexRange) - print2("# New Rules: %s" % diagonalRules) #return (skinnyRules01, skinnyRules10, diagonalRules) - return (problemType, solutionsUsed, [], [], diagonalRules ) + #return (problemType, logic.solutionsUsed, [], [], logic.diagonalRules ) + return (problemType, [], [], [], [] ) ################################################################################ -# BenchmarkDataAnalyzer +# LogicAnalyzer ################################################################################ -class BenchmarkDataAnalyzer: +class LogicAnalyzer: ######################################## # diagonal rule looks like @@ -169,26 +85,39 @@ class BenchmarkDataAnalyzer: # 1: problemIndices for minThreshold problem # 2: gflops at above minSize + ############################################################################## + ############################################################################## + ### + ### Entry / Top-Level Functions + ### + ############################################################################## + ############################################################################## - def __init__(self, problemType, problemSizes, solutions, analysisParameters): + ############################################################################## + # ENTRY: Init + ############################################################################## + def __init__(self, problemType, problemSizes, solutions, inputParameters): self.problemType = problemType self.problemSizes = problemSizes - self.analysisParameters = analysisParameters + self.parameters = inputParameters print2("ProblemSizes: %s" % self.problemSizes) # TODO verify that data is symmetric for diagonal #if self.problemSizes[self.problemType["Index0"]] \ # != self.problemSizes[self.problemType["Index1"]]: # printExit("d0 / d1 must be symmetric for analysis.") self.numProblemSizes = problemSizes.numProblemSizes # native order - print2("NumProblemSizes: %s" % self.numProblemSizes) + print1("NumProblemSizes: %s" % self.numProblemSizes) self.numIndices = len(self.numProblemSizes) self.solutions = solutions self.numSolutions = len(self.solutions) self.solutionMinNaming = Solution.getMinNaming(solutions) self.solutionNames = [] + self.solutionTiles = [] for solution in self.solutions: self.solutionNames.append(Solution.getNameMin(solution, \ self.solutionMinNaming)) + self.solutionTiles.append("%ux%u"%(solution["MacroTile0"], solution["MacroTile1"])) + self.flopsPerMac = self.problemType["DataType"].flopsPerMac() # special indices self.idx0 = self.problemType["Index0"] @@ -229,13 +158,27 @@ def __init__(self, problemType, problemSizes, solutions, analysisParameters): currentSize += currentStride currentStride += index[2] idx += 1 + self.rangeIndicesFree = range(0, self.problemType["NumIndicesC"]) + self.rangeIndicesSummation = range(self.problemType["NumIndicesC"], \ + self.problemType["TotalIndices"]) + self.w0 = self.parameters["Weight0"] + self.w1 = self.parameters["Weight1"] + self.w2 = self.parameters["Weight2"] #print "S->I %s" % self.problemSizeToIndex #print "I->S %s" % self.problemIndexToSize + self.indexOrder = self.recommendedIndexOrder() + print2("IndexOrder: %s" % self.indexOrder) + self.globalIndexRange = [] + for i in range(0, self.numIndices): + self.globalIndexRange.append([0, self.numProblemSizes[i]]) + self.problemIndicesForGlobalRange \ + = self.problemIndicesForRange(self.globalIndexRange) ############################################################################## - # Read In CSV + # ENTRY: Read In CSV + ############################################################################## def populateFromCSV(self, dataFileName): # open file @@ -288,14 +231,473 @@ def populateFromCSV(self, dataFileName): ############################################################################## - # Get Fastest Solutions Along Diagonal (d0=d1) for largest sizes + # ENTRY: Remove Invalid Solutions + ############################################################################## + def removeInvalidSolutions(self): + #problemIndices = [0]*self.numIndices + allSolutionValid = False + while not allSolutionValid: + moreProblems = True + invalidIdx = -1 + for problemIndices in self.problemIndicesForGlobalRange: + problemIdx = self.indicesToSerial(0, problemIndices) + for solutionIdx in range(0, self.numSolutions): + gflops = self.data[problemIdx+solutionIdx] + if gflops == 0: + invalidIdx = solutionIdx + break + if invalidIdx >= 0: + print1("# Removing Invalid Solution: %u %s" \ + % (invalidIdx, self.solutionNames[invalidIdx]) ) + self.removeSolution(invalidIdx) + else: + allSolutionValid = True + + + ############################################################################## + # ENTRY: Remove Least Important Solutions + ############################################################################## + def removeLeastImportantSolutions(self): + # Remove least important solutions + while True: + (lisIdx, lisPercSaved, lisPercWins, lisPercExec) \ + = self.leastImportantSolution() + if lisPercSaved < self.parameters["FractionTimeSavedMin"]: + self.removeSolution(lisIdx) + continue + else: + break + + + ############################################################################## + # ENTRY: Smooth - correct outliers + ############################################################################## + def smooth(self): + outlierThreshold = self.parameters["OutlierThreshold"] + problemSizes = [0]*self.numIndices + for problemIndices in self.problemIndicesForGlobalRange: + problemIdx = self.indicesToSerial(0, problemIndices) + + for solutionIdx in range(0, self.numSolutions): + gflops = self.data[problemIdx+solutionIdx] + neighborGFlops = [] + smoothProblem = False + for iIdx in range(0, self.numIndices): + if problemIndices[iIdx] > 0 \ + and problemIndices[iIdx] < self.numProblemSizes[iIdx]-1: + neighborBeforeIndices = deepcopy(problemIndices) + neighborAfterIndices = deepcopy(problemIndices) + neighborBeforeIndices[iIdx] -= 1 + neighborAfterIndices[iIdx] += 1 + neighborBeforeIdx = self.indicesToSerial(0, neighborBeforeIndices) + neighborAfterIdx = self.indicesToSerial(0, neighborAfterIndices) + neighborBeforeGFlops = self.data[neighborBeforeIdx+solutionIdx] + neighborAfterGFlops = self.data[neighborAfterIdx+solutionIdx] + neighborGFlops.append(neighborBeforeGFlops) + neighborGFlops.append(neighborAfterGFlops) + if neighborBeforeGFlops > gflops * (1+outlierThreshold) \ + and neighborAfterGFlops * (1+outlierThreshold) < gflops : + smoothProblem = True + if smoothProblem: + s = "" + for i in range(0, self.numIndices): + problemSizes[i] = self.problemIndexToSize[i][problemIndices[i]] + s += "%u, " % problemSizes[i] + new = sum(neighborGFlops)/len(neighborGFlops) + old = self.data[problemIdx+solutionIdx] + s += "%f -> %f" % (old, new) + print s + self.data[problemIdx+solutionIdx] \ + = sum(neighborGFlops)/len(neighborGFlops) + + + ############################################################################## + # ENTRY: En Rule + # currentIndexIndex = 0, 1, 2, 3... + ############################################################################## + def enRule(self, currentIndexIndex, currentIndexRange): + currentIndex = self.indexOrder[currentIndexIndex] + lastIndex = currentIndexIndex == self.numIndices-1 + + # if there's only 1 problem size here + if currentIndexRange[currentIndex][1] \ + - currentIndexRange[currentIndex][0] == 1: + # this is last index, so just return fastest solution + if lastIndex: + scores = scoreRangeForSolutions(currentIndexRange) + winnerIdx = 0 + for solutionIdx in range(1, self.numSolution): + if scores[solutionIdx] < scores[winnerIdx]: + winnerIdx = solutionIdx + rule = [ -1, winnerIdx ] + # this isn't last index, so just return next index + else: + newIndexIndex = currentIndexIndex+1 + newIndexRange = deepcopy(currentIndexRange) + rule = [ -1, self.enRule(newIndexIndex, newIndexRange) ] + + # create rule for smallest size + + # for all problem indices in this index + for problemIndex in range(currentIndexRange[currentIndex][0], \ + currentIndexRange[currentIndex][1]): + # rules = seed with smallest rule + # for dimIdx = 0 -> numSizes + # if newRule + # score range using newRule + # score range using priorRule + # accept/reject based on score + # current index is dimOrder[0] + + + + + + sumValues = [] + totalSummationSizes = 1 + for i in self.rangeIndicesSummation: + totalSummationSizes *= self.numProblemSizes[i] + summationPermutations = [] + for permutationIdx in range(0, totalSummationSizes): + permutation = [] + permutationSize = 1 + pIdx = permutationIdx + for i in self.rangeIndicesSummation: + idx = pIdx % self.numProblemSizes[i] + permutation.append(idx) + permutationSize *= self.problemIndexToSize[i][idx] + pIdx /= self.numProblemSizes[i] + # insert permutation in sorted order + insertIdx = len(summationPermutations)-1 + for pIdx in range(0, len(summationPermutations)): + size = 1 + for i in self.rangeIndicesSummation: + size *= self.problemIndexToSize[i][summationPermutations[pIdx][i]] + if permutationSize > size: + insertIdx = pIdx + break + summationPermutations.insert(insertIdx, permutation) + print "SummationPermutations:", summationPermutations + + + if len(summationPermutations) == 1: + rules = [ 0, self.createRules01(summationPermutations[0]) ] + return rules + else: + printExit("No Logic to support multiple summation sizes.") + # iterate over summation permutations +# for each serial pair, scoreA, scoreB, scoreAB +# keep rule AB if scoreAB isn't much slower than scoreA + scoreB + + """ + sizeSummation *= self.problemIndexToSize[i][problemIndices[i]] + + firstProblemIndices = [] + lastProblemIndices = [] + for i in range(0, self.numIndices): + firstProblemIndices.append(0) + lastProblemIndices.append(self.numProblems[i]-1) + minSumValue = self.getSizeSummation(firstProblemIndices) + maxSumValue = self.getSizeSummation(lastProblemIndices) + numSumValues = + + + rule = [ + [ + minU, # k threshold + [[min01,s], [0,s]], # diagonals + [0, max0, [[min1,s], [min1,s]]], # skinny0's + [1, max1, [[min0,s], [min0,s]]], # skinny1's + ], + [ + minU, # k threshold + [[min01,s], [0,s]], # diagonals + [0, max0, [[min1,s], [min1,s]]], # skinny0's + [1, max1, [[min0,s], [min0,s]]], # skinny1's + ], + ] + + ruleA = createRules01() + ruleB = createRules01() + + minSumValue = 0 + maxSumValue = self.numProblems + + + sizeSummation = 1 + for i in range(self.problemType["NumIndicesC"], \ + self.problemType["TotalIndices"]): + sizeSummation *= self.problemIndexToSize[i][problemIndices[i]] + return sizeSummation + """ + + + + ############################################################################## + ############################################################################## + ### + ### Mid-Level Functions + ### + ############################################################################## + ############################################################################## + + + ############################################################################## - def getFastestSolutionsAlongDiagonal(self, problemIndices): - print2("\nFastest Diagonal idxU: %u" % problemIndices[self.idxU]) + # Create Rules dim0 / dim1 + ############################################################################## + def createRules01(self, problemSizeSummation ): + + diagonalRules = self.createRulesDiagonal(problemSizeSummation) + + + ############################################################################## + # Create Rules Diagonal + ############################################################################## + def createRulesDiagonal(self, problemSizeSummation): + thresholdForDiagonality = 1.5 # slightly fewer problems than 2 + numProblemSizesFastestDiagonal = 16 + problemIndices = [0]*self.numIndices + for i in self.rangeIndicesSummation: + problemIndices[i] = problemSizeSummation[i \ + - self.problemType["NumIndicesC"]] + print2("\nDiagonalRules for %s" % problemIndices) + problemSizes = [0]*self.numIndices + totalFlopsPerSizeFree = self.flopsPerMac + for i in self.rangeIndicesSummation: + totalFlopsPerSizeFree *= self.problemIndexToSize[i][problemIndices[i]] + print "totalFlopsPerSizeFree", totalFlopsPerSizeFree + + ######################################## + # transform data into serial list of "diagonal problem sizes" + diagonalData = [] + moreProblems = True + while moreProblems: + + # size free + for i in range(0, self.numIndices): + problemSizes[i] = self.problemIndexToSize[i][problemIndices[i]] + size0 = problemSizes[self.idx0] + size1 = problemSizes[self.idx1] + + # if diagonal + if size0 < size1*thresholdForDiagonality \ + and size1 < size0*thresholdForDiagonality: + sizeFree = self.getSizeFree(problemIndices) + + problemIdx = self.indicesToSerial(0, problemIndices) + solutionGFlops = [] + for i in range(0, self.numSolutions): + solutionGFlops.append(self.data[problemIdx+i]) + + diagonalData.append([ sizeFree, solutionGFlops ]) + + # next problem + problemIndices[0] += 1 + for i in self.rangeIndicesFree: + if problemIndices[i] >= self.numProblemSizes[i]: + if i == self.problemType["NumIndicesFree"]-1: + moreProblems = False + break + else: + problemIndices[i] = 0 + problemIndices[i+1] += 1 + else: + break + + diagonalData.sort(key=lambda x: x[0], reverse=True) + for dd in diagonalData: + print "DD[%u]: %s" % (dd[0], dd[1]) + print len(diagonalData) + + + ######################################## + # create first rule + sizeFree = diagonalData[0][0] + relativeTime = [0]*self.numSolutions + for i in range(0, numProblemSizesFastestDiagonal): + for j in range(0, self.numSolutions): + gflops = diagonalData[i][1][j] + relativeTime[j] += 1 / gflops + winnerIdx = 0 + winnerRelativeTime = relativeTime[0] + for i in range(1, self.numSolutions): + if relativeTime[i] < winnerRelativeTime: + winnerIdx = i + winnerRelativeTime = relativeTime[i] + print "FastestDiagonalSolution:", winnerIdx, self.solutionNames[winnerIdx] + fastestGFlops = 0 + for i in range(0, numProblemSizesFastestDiagonal): + gflops = diagonalData[i][1][winnerIdx] + if gflops > fastestGFlops: + fastestGFlops = gflops + + rules = [] + # minGFlops maxGFlops oldGFlops? + rules.append([winnerIdx, sizeFree, fastestGFlops, fastestGFlops, -1]) + print "Winner[%3u]: %u" % (0, winnerIdx) +# we can't just pay attention to single winner +# we need to compute scores for all solutions over a window +# b/c 441115111333 +# = 441555555333 +# +# we can do a smoothing pass to get rid of bogus data; if a data point is more than x% slower than 4 surrounding points, than its bogus, just set it equal to average of 4 surrounding points +# + + ######################################## + # create subsequent rules for smaller sizes + for diagonalDataIdx in range(1, len(diagonalData)): + print "DiagonalDataIdx:", diagonalDataIdx + # prior rule + priorRule = rules[len(rules)-1] + priorWinnerIdx = priorRule[0] + # candidate winner + candidateWinnerIdx = 0 + candidateWinnerGFlops = diagonalData[diagonalDataIdx][1][0] + for j in range(1, self.numSolutions): + gflops = diagonalData[diagonalDataIdx][1][j] + if gflops > candidateWinnerGFlops: + candidateWinnerIdx = j + candidateWinnerGFlops = gflops + if candidateWinnerIdx == priorWinnerIdx: + # update prior rule to include this sizeFree + rules[len(rules)-1][1] = diagonalData[diagonalDataIdx][0] # size free + rules[len(rules)-1][2] = \ + diagonalData[diagonalDataIdx][1][priorWinnerIdx] # perf at size + continue + else: + # candidate rule + sizeFree = diagonalData[diagonalDataIdx][0] + totalFlops = sizeFree*totalFlopsPerSizeFree + candidateGFlops = diagonalData[diagonalDataIdx][1][candidateWinnerIdx] + priorGFlops = diagonalData[diagonalDataIdx][1][priorWinnerIdx] + candidateRule = [ candidateWinnerIdx, sizeFree, candidateGFlops, \ + candidateGFlops, -1 ] + # candidate and prior scores + candidateTimeUs = totalFlops / candidateGFlops / 1000 + priorTimeUs = totalFlops / priorGFlops / 1000 + candidateScore = 1*self.w2 + candidateTimeUs + priorScore = 0*self.w2 + priorTimeUs + print "DDI[%3u] Prior[%2u]: %.0fus vs Candi[%2u]: %.0fus" \ + % (diagonalDataIdx, priorWinnerIdx, priorScore, candidateWinnerIdx, candidateScore) + checkMoreProblems = True + for newDiagonalDataIdx in range(diagonalDataIdx+1, len(diagonalData)): + newWinnerIdx = 0 + newWinnerGFlops = diagonalData[newDiagonalDataIdx][1][0] + for j in range(1, self.numSolutions): + gflops = diagonalData[newDiagonalDataIdx][1][j] + if gflops > newWinnerGFlops: + newWinnerIdx = j + newWinnerGFlops = gflops + # update candidate and prior scores + sizeFree = diagonalData[newDiagonalDataIdx][0] + totalFlops = sizeFree*totalFlopsPerSizeFree + candidateGFlops = \ + diagonalData[newDiagonalDataIdx][1][candidateWinnerIdx] + priorGFlops = diagonalData[newDiagonalDataIdx][1][priorWinnerIdx] + candidateTimeUs = totalFlops / candidateGFlops / 1000 + priorTimeUs = totalFlops / priorGFlops / 1000 + candidateScore += candidateTimeUs + priorScore += priorTimeUs + print " NDDI[%3u] Prior[%2u]: %.0fus vs Candi[%2u]: %.0fus" \ + % (newDiagonalDataIdx, priorWinnerIdx, priorScore, \ + candidateWinnerIdx, candidateScore) + if newWinnerIdx == candidateWinnerIdx: + print " newWinnerIdx == candidateWinnerIdx" + if candidateScore < priorScore: + # candidate rule accepted + rules.append(candidateRule) + print " accepting" + break + else: + # candidate rule not yet accepted + candidateRule[1] = sizeFree + candidateRule[2] = candidateGFlops + print " continuing" + continue + elif newWinnerIdx == priorWinnerIdx: + print " newWinnerIdx == priorWinnerIdx" + # returned to original winner, decide now to accept/reject + if candidateScore < priorScore: + # candidate rule accepted + rules.append(candidateRule) + print " accepting" + break + else: + # candidate rule rejected; update prior, continue at newSize + rules[len(rules)-1][1] = sizeFree + rules[len(rules)-1][2] = priorGFlops + diagonalDataIdx = newDiagonalDataIdx + print " rejecting" + break + else: + print " newWinnerIdx is %u" % newWinnerIdx + # new winner was a 3rd solution; decide now (same as above) + if candidateScore < priorScore: + # candidate rule accepted + rules.append(candidateRule) + print " accepting" + break + else: + # candidate rule rejected; update prior, continue at newSize + rules[len(rules)-1][1] = diagonalData[newDiagonalDataIdx][0] + rules[len(rules)-1][2] = \ + diagonalData[newDiagonalDataIdx][1][priorWinnerIdx] + diagonalDataIdx = newDiagonalDataIdx + print " rejecting" + break + + return + + # go farther forward, does candidate rule keep winning, or does priorRule keep winning? + # the new rule should start at a loss b/c of Weight2 + # a few problems in the future + # if new rule is better, W2 gets amortized, Wt improves + # if new rule is worse, W2 gets amortized, Wt worsens + # continue to future problems until, and make final decision + # newRule gets better score; accept + # return to priorRule winner; accept/reject + # Yet a new winner + # easy: make final accept/reject including this new problem size + # hard: recure? + # + # is the num problems in future vary with W2,Wt? +# Wt = 1 +# W2 = 1 means we would rather lose 1us per kernel rather than adding another split (actually they're equal) +# so, in order for candidate to be accepted immediately, it must improve all kernels by more than 1us, or after 2 sizes, improve by 0.5us per kernel +# +# +# 0 0 1 0 0 +# 0 0 1 1 0 +# 0 0 1 4 0 +# 0 0 1 4 1 0 +# + + print "Winner[%3u]: %u" % (i, winnerIdx) + + + return + + + + + + + + + + + + + + + + + # abstract to multidimensions # what is the diagonal - dilation = self.analysisParameters["Dilation"] - threshold = self.analysisParameters["Threshold"] + dilation = self.self.parameters["Dilation"] + threshold = self.self.parameters["Threshold"] numProblems0 = self.numProblemSizes[self.idx0] ############################################################################ @@ -429,8 +831,8 @@ def getSkinnySolutions(self, diagonalRules, problemIndices, \ idx0 = self.idx0 idx1 = self.idx1 #idxU = self.idxU - #dilation = self.analysisParameters["Dilation"] - threshold = self.analysisParameters["Threshold"] + #dilation = self.self.parameters["Dilation"] + threshold = self.self.parameters["Threshold"] skinnyRules = [] @@ -500,42 +902,374 @@ def getSkinnySolutions(self, diagonalRules, problemIndices, \ return skinnyRules # end skinny solutions + ############################################################################## - # Get Size Free and Summation + # Determine Logic Along U ############################################################################## - def getWinnerForProblem(self, problemIndices): - problemIdx = self.indicesToSerial(0, problemIndices) - winnerIdx = -1 - winnerGFlops = -1 - for solutionIdx in range(0, self.numSolutions): - solutionSerialIdx = problemIdx + solutionIdx - solutionGFlops = self.data[solutionSerialIdx] - if solutionGFlops > winnerGFlops: - #print "%f > %f" % (solutionGFlops, winnerGFlops) - winnerIdx = solutionIdx - winnerGFlops = solutionGFlops - return (winnerIdx, winnerGFlops) + def determineLogicAlongU(self): + globalRange = [] + for i in range(0, self.numIndices): + globalRange.append( [0, self.numProblemSizes[i]] ) + + + + + self.print2D([0, 0]) + + ############################################################################ + # Determine Solutions Along Diagonal + # roughly same splitting regardless of sizeU + problemIndices = [] + for numProblemsForIndex in self.numProblemSizes: + problemIndices.append(numProblemsForIndex-1) + print problemIndices + self.diagonalRules = self.getFastestSolutionsAlongDiagonal(problemIndices) + if True: + print2("Diagonal Rules:") + for rule in self.diagonalRules: + string = " if freeSize >=%4u" % self.problemIndexToSize[0][rule[1][0]] + for i in range(1, self.numIndices): + string += "x%4u" % self.problemIndexToSize[i][rule[1][i]] + string += " return S[%u] @ %5.0f-%5.0f>%5.0f GFlops is %s" \ + % (rule[0], rule[2], rule[3], rule[4], \ + self.solutionNames[rule[0]]) + print2(string) + + ############################################################################ + # Determine Skinny0 Solutions + skinnyRules01 = self.getSkinnySolutions(self.diagonalRules, problemIndices, \ + self.idx0, self.idx1) + #print "Skinny Rules:" + #for rule in skinnyRules01: + # string = " if freeSize >=%4u" % data.problemIndexToSize[0][rule[1][0]] + # for i in range(1, data.numIndices): + # string += "x%4u" % data.problemIndexToSize[i][rule[1][i]] + # string += " return S[%u] @ %5.0f-%5.0f>%5.0f GFlops is %s" \ + # % (rule[0], rule[2], rule[3], rule[4], \ + # data.solutionNames[rule[0]]) + + ############################################################################ + # Determine Skinny1 Solutions + skinnyRules10 = self.getSkinnySolutions(self.diagonalRules, problemIndices, \ + self.idx1, self.idx0) + + # list solutions that actually get used + solutionIndicesUsed = [] + for rule in skinnyRules01: + pass + for rule in skinnyRules10: + pass + for rule in self.diagonalRules: + solutionIdx = rule[0] + solution = self.solutions[solutionIdx] + MT0 = solution["MacroTile0"] + MT1 = solution["MacroTile1"] + DU = solution["DepthU"] + #print "Rule Tile S[%u]: %ux%ux%u" % (solutionIdx, MT0, MT1, DU) + # is this solution in the list + inList = False + for solutionUsed in solutionIndicesUsed: + if solutionUsed[0] == solutionIdx: + inList = True + break + if not inList: + insertIdx = len(solutionIndicesUsed) + for i in range(0, len(solutionIndicesUsed)): + iMT0 = solutionIndicesUsed[i][1] + iMT1 = solutionIndicesUsed[i][2] + iDU = solutionIndicesUsed[i][3] + #print " compare S[%u]: %ux%ux%u" % (solutionIndicesUsed[i][0], \ + # iMT0, iMT1, iDU) + if MT0*MT1 < iMT0*iMT1: + insertIdx = i + break + elif MT0*MT1 > iMT0*iMT1: + continue + else: # MT == MT + if DU < iDU: + insertIdx = i + break + else: + continue + + # if i'm smaller than i, insert me before i + #print "insert: %u" % insertIdx + solutionIndicesUsed.insert(insertIdx, [solutionIdx, MT0, MT1, DU]) + #print solutionIndicesUsed + + # list of solutions used + self.solutionsUsed = [] + for solutionIndexUsed in solutionIndicesUsed: + self.solutionsUsed.append(self.solutions[solutionIndexUsed[0]]) + + # translate rules to new solution indices + for rule in skinnyRules01: + pass + for rule in skinnyRules10: + pass + for ruleIdx in range(0, len(self.diagonalRules)): + solutionIdx = self.diagonalRules[ruleIdx][0] + for i in range(0, len(solutionIndicesUsed)): + solutionIndexUsed = solutionIndicesUsed[i] + if solutionIdx == solutionIndexUsed[0]: + self.diagonalRules[ruleIdx][0] = i + break + # change problemSizeIndices to sizes + for i in range(0, 3): + self.diagonalRules[ruleIdx][1][i] = \ + self.problemIndexToSize[i][ self.diagonalRules[ruleIdx][1][i] ] + + print2("# New Rules: %s" % self.diagonalRules) + ############################################################################## - # Get Size Free and Summation ############################################################################## - def getSizeFree(self, problemIndices): - sizeFree = 1 - for i in range(0, self.problemType["NumIndicesC"]): - sizeFree *= self.problemIndexToSize[i][problemIndices[i]] - return sizeFree + ### + ### Helper / Low-Level Functions + ### + ############################################################################## + ############################################################################## + - def getSizeSummation(self, problemIndices): - sizeSummation = 1 - for i in range(self.problemType["NumIndicesC"], \ - self.problemType["TotalIndices"]): - sizeSummation *= self.problemIndexToSize[i][problemIndices[i]] - return sizeSummation ############################################################################## - # Print Data + # Print2D + def print2D(self, indices ): + indicesIdx = 0 + problemIndices = [] + for i in range(0, self.numIndices): + if i == self.idx0: + problemIndices.append(-1) + elif i == self.idx1: + problemIndices.append(-1) + else: + problemIndices.append(indices[indicesIdx]) + indicesIdx += 1 + + winnerIndices = [] + w = "winner" + g = "gflops" + f = "faster" + s = "second" + sss = [] + for sIdx in range(0, self.numSolutions): + sss.append("Sol[%u]" % sIdx) + for j in range(0, self.numProblemSizes[1]): + w += ",%4u" % self.problemIndexToSize[1][j] + g += ",%4u" % self.problemIndexToSize[1][j] + f += ",%4u" % self.problemIndexToSize[1][j] + s += ",%4u" % self.problemIndexToSize[1][j] + for sIdx in range(0, self.numSolutions): + sss[sIdx] += ",%4u" % self.problemIndexToSize[1][j] + w += "\n" + g += "\n" + f += "\n" + s += "\n" + for sIdx in range(0, self.numSolutions): + sss[sIdx] += "\n" + for i in range(0, self.numProblemSizes[0]): + problemIndices[self.idx0] = i + w += "%4u" % self.problemIndexToSize[0][i] + g += "%4u" % self.problemIndexToSize[0][i] + f += "%4u" % self.problemIndexToSize[0][i] + s += "%4u" % self.problemIndexToSize[0][i] + for sIdx in range(0, self.numSolutions): + sss[sIdx] += "%4u" % self.problemIndexToSize[0][i] + for j in range(0, self.numProblemSizes[1]): + problemIndices[self.idx1] = j + problemIdx = self.indicesToSerial(0, problemIndices) + for sIdx in range(0, self.numSolutions): + sss[sIdx] += ",%f" % self.data[problemIdx+sIdx] + + if self.data[problemIdx+0] > self.data[problemIdx+1]: + winnerIdx = 0 + winnerGFlops = self.data[problemIdx+0] + secondIdx = 1 + secondGFlops = self.data[problemIdx+1] + else: + winnerIdx = 1 + winnerGFlops = self.data[problemIdx+1] + secondIdx = 0 + secondGFlops = self.data[problemIdx+0] + for solutionIdx in range(2, self.numSolutions): + solutionSerialIdx = problemIdx + solutionIdx + solutionGFlops = self.data[solutionSerialIdx] + if solutionGFlops > winnerGFlops: + #print "%f > %f" % (solutionGFlops, winnerGFlops) + secondIdx = winnerIdx + secondGFlops = winnerGFlops + winnerIdx = solutionIdx + winnerGFlops = solutionGFlops + + + if winnerIdx not in winnerIndices: + winnerIndices.append(winnerIdx) + w += ",%4u" % winnerIdx + g += ",%f" % winnerGFlops + f += ",%f" % (winnerGFlops/secondGFlops) + s += ",%4u" % (secondIdx) + w += "\n" + g += "\n" + f += "\n" + s += "\n" + for sIdx in range(0, self.numSolutions): + sss[sIdx] += "\n" + + w += "\n\n" + g += "\n\n" + f += "\n\n" + s += "\n\n" + for sIdx in range(0, self.numSolutions): + sss[sIdx] += "\n\n" + w += "Winners:\n" + for winnerIdx in winnerIndices: + w += "%4u, %s, %s\n" % (winnerIdx, self.solutionTiles[winnerIdx], self.solutionNames[winnerIdx]) + + printFileName = "Winner2D" + for idx in indices: + printFileName += "_%u" % idx + printFileName += ".csv" + printFile = open(os.path.join(globalParameters["WorkingPath"], printFileName), "w") + printFile.write( w ) + printFile.write( g ) + printFile.write( f ) + printFile.write( s ) + for sIdx in range(0, self.numSolutions): + printFile.write( sss[sIdx] ) + printFile.close() + + + ############################################################################## + # Least Important Solution + def leastImportantSolution(self): + solutionImportance = [] + for i in range(0, self.numSolutions): + solutionImportance.append([i, 0, 0, 0]) + problemSizes = [0]*self.numIndices + totalSavedMs = 0 + totalExecMs = 0 + totalWins = 0 + for problemIndices in self.problemIndicesForGlobalRange: + for i in range(0, self.numIndices): + problemSizes[i] = self.problemIndexToSize[i][problemIndices[i]] + totalFlops = self.flopsPerMac + for size in problemSizes: + totalFlops *= size + + problemIdx = self.indicesToSerial(0, problemIndices) + if self.data[problemIdx+0] > self.data[problemIdx+1]: + winnerIdx = 0 + winnerGFlops = self.data[problemIdx+0] + secondIdx = 1 + secondGFlops = self.data[problemIdx+1] + else: + winnerIdx = 1 + winnerGFlops = self.data[problemIdx+1] + secondIdx = 0 + secondGFlops = self.data[problemIdx+0] + + for solutionIdx in range(2, self.numSolutions): + solutionSerialIdx = problemIdx + solutionIdx + solutionGFlops = self.data[solutionSerialIdx] + if solutionGFlops > winnerGFlops: + secondIdx = winnerIdx + secondGFlops = winnerGFlops + winnerIdx = solutionIdx + winnerGFlops = solutionGFlops + winnerTimeMs = totalFlops / winnerGFlops / 1000000 + secondTimeMs = totalFlops / secondGFlops / 1000000 + solutionImportance[winnerIdx][1] += (secondTimeMs - winnerTimeMs) + solutionImportance[winnerIdx][2] += 1 + solutionImportance[winnerIdx][3] += winnerTimeMs + + totalSavedMs += secondTimeMs - winnerTimeMs + totalExecMs += winnerTimeMs + totalWins += 1 + solutionImportance.sort(key=lambda x: x[1]) + return ( solutionImportance[0][0], \ + solutionImportance[0][1] / totalSavedMs, \ + solutionImportance[0][2] / totalWins, \ + solutionImportance[0][3] / totalExecMs ) + + + ############################################################################## + # Score Range For Logic + def scoreRangeForLogic(self, indexRange, logic): + pass + + ############################################################################## + # Score (microseconds) Range For Solutions + def scoreRangeForSolutions(self, indexRange): + scores = [0]*self.numSolutions + for problemIndices in self.problemIndicesForRange(indexRange): + problemSerial = self.indicesToSerial(0, problemIndices) + totalFlops = self.totalFlopsForProblemIndices(problemIndices) + for solutionIdx in range(0, self.numSolutions): + gflops = self.data[problemSerial+solutionIdx] + timeUs = totalFlops / gflops / 1000 + scores[solutionIdx] += timeUs + return scores + + ############################################################################## + # Total Flops For Problem Indices + def totalFlopsForProblemIndices(self, problemIndices): + totalFlops = self.flopsPerMac + for i in range(0, self.numIndices): + totalFlops *= self.problemIndexToSize[i][problemIndices[i]] + return totalFlops + ############################################################################## + # Remove Solution + def removeSolution(self, removeSolutionIdx): + + # temporarily move current to old + oldSolutions = self.solutions + oldNumSolutions = self.numSolutions + oldData = self.data + oldTotalSize = self.totalSize + + # update solutions + self.solutions = [] + for i in range(0, oldNumSolutions): + if i != removeSolutionIdx: + self.solutions.append(oldSolutions[i]) + self.solutionMinNaming = Solution.getMinNaming(self.solutions) + self.solutionNames = [] + self.solutionTiles = [] + for solution in self.solutions: + self.solutionNames.append(Solution.getNameMin(solution, \ + self.solutionMinNaming)) + self.solutionTiles.append("%ux%u"%(solution["MacroTile0"], \ + solution["MacroTile1"])) + self.numSolutions = len(self.solutions) + + # update data + self.totalSize = self.totalProblems * self.numSolutions + self.data = array.array('f', [0]*self.totalSize) + for problemIdx in range(0, self.totalProblems): + newSolutionIdx = 0 + for oldSolutionIdx in range(0, oldNumSolutions): + if oldSolutionIdx != removeSolutionIdx: + self.data[problemIdx*self.numSolutions+newSolutionIdx] \ + = oldData[problemIdx*oldNumSolutions+oldSolutionIdx] + newSolutionIdx += 1 + + ############################################################################## + # Recommended Index Order + # TODO, this may depend on transposes + def recommendedIndexOrder(self): + order = [] + for i in range(0, self.numIndices): + if i != self.idxU and i != self.idx1 and i != self.idx0: + order.append(i) + order.append(self.idxU) + order.append(self.idx0) + order.append(self.idx1) + return order + + ############################################################################## + # Print Data def printData(self): print2("serial; idxD0, idxD1, idxDU, idxOthers; sizeD0, sizeD1, sizeDU, sizeOthers; sol0, sol1, sol2, ...") indices = [0]*self.numIndices @@ -557,26 +1291,83 @@ def printData(self): indices[i] += 1 ############################################################################## - # Get Item + # Problem Indices For Range + def problemIndicesForRange(self, indexRange): + problemIndexList = [] + problemIndices = [] + for idx in indexRange: + problemIndices.append(idx[0]) + moreProblems = True + while moreProblems: + problemIndexList.append(deepcopy(problemIndices)) + # next problem + problemIndices[0] += 1 + for i in range(0, self.numIndices): + if problemIndices[i] >= indexRange[i][1]: + if i == self.numIndices-1: + moreProblems = False + break + else: + problemIndices[i] = indexRange[i][0] + problemIndices[i+1] += 1 + else: + break + return problemIndexList + ############################################################################## + # Get Winner For Problem + def getWinnerForProblem(self, problemIndices): + problemIdx = self.indicesToSerial(0, problemIndices) + winnerIdx = -1 + winnerGFlops = -1 + for solutionIdx in range(0, self.numSolutions): + solutionSerialIdx = problemIdx + solutionIdx + solutionGFlops = self.data[solutionSerialIdx] + if solutionGFlops > winnerGFlops: + #print "%f > %f" % (solutionGFlops, winnerGFlops) + winnerIdx = solutionIdx + winnerGFlops = solutionGFlops + return (winnerIdx, winnerGFlops) + + + ############################################################################## + # Get Size Free + def getSizeFree(self, problemIndices): + sizeFree = 1 + for i in self.rangeIndicesFree: + sizeFree *= self.problemIndexToSize[i][problemIndices[i]] + return sizeFree + + + ############################################################################## + # Get Size Summation + def getSizeSummation(self, problemIndices): + sizeSummation = 1 + for i in self.rangeIndicesSummation: + sizeSummation *= self.problemIndexToSize[i][problemIndices[i]] + return sizeSummation + + + ############################################################################## + # Get Item def __getitem__(self, indexTuple): indices = indexTuple[0] # in analysis order solutionIdx = indexTuple[1] serial = self.indicesToSerial(solutionIdx, indices) return self.data[serial] + ############################################################################## - # Get Item - ############################################################################## + # Set Item def __setitem__(self, indexTuple, value): indices = indexTuple[0] # in analysis order solutionIdx = indexTuple[1] serial = self.indicesToSerial(solutionIdx, indices ) self.data[serial] = value + ############################################################################## # Indices -> Serial - ############################################################################## def indicesToSerial(self, solutionIdx, indices ): serial = 0 stride = 1 @@ -588,8 +1379,13 @@ def indicesToSerial(self, solutionIdx, indices ): return serial + +################################################################################ +################################################################################ +### +### Main +### ################################################################################ -# Main ################################################################################ def main( config ): print2("# LibraryLogic config: %s" % config) diff --git a/Tensile/SolutionStructs.py b/Tensile/SolutionStructs.py index 7dc27a644..795c32022 100644 --- a/Tensile/SolutionStructs.py +++ b/Tensile/SolutionStructs.py @@ -123,6 +123,8 @@ def numRegisters( self ): return self.properties[self.value][self.idxReg] def numBytes( self ): return self.numRegisters() * 4 + def flopsPerMac(self): + return 1 if self.isReal() else 8 def __str__(self): return self.toChar() From 35ba76651df0896e44502b5c53fd908e0399515e Mon Sep 17 00:00:00 2001 From: David Tanner Date: Wed, 1 Mar 2017 15:59:02 -0600 Subject: [PATCH 13/21] recursive logic runs to completion --- Tensile/Common.py | 3 + Tensile/LibraryLogic.py | 921 +++++++++------------------------------- 2 files changed, 206 insertions(+), 718 deletions(-) diff --git a/Tensile/Common.py b/Tensile/Common.py index 557690f79..9b04526c0 100644 --- a/Tensile/Common.py +++ b/Tensile/Common.py @@ -152,6 +152,9 @@ # Default Analysis Parameters ################################################################################ defaultAnalysisParameters = { + "InitialSolutionWindow": 4, + "BranchWeight": 100, # microseconds / kernel + "Dilation": 3, "Threshold": 0.1, "OutlierThreshold": 0, diff --git a/Tensile/LibraryLogic.py b/Tensile/LibraryLogic.py index 72b8d60cb..9dfeba30b 100644 --- a/Tensile/LibraryLogic.py +++ b/Tensile/LibraryLogic.py @@ -240,9 +240,9 @@ def removeInvalidSolutions(self): moreProblems = True invalidIdx = -1 for problemIndices in self.problemIndicesForGlobalRange: - problemIdx = self.indicesToSerial(0, problemIndices) + problemSerial = self.indicesToSerial(0, problemIndices) for solutionIdx in range(0, self.numSolutions): - gflops = self.data[problemIdx+solutionIdx] + gflops = self.data[problemSerial+solutionIdx] if gflops == 0: invalidIdx = solutionIdx break @@ -276,10 +276,10 @@ def smooth(self): outlierThreshold = self.parameters["OutlierThreshold"] problemSizes = [0]*self.numIndices for problemIndices in self.problemIndicesForGlobalRange: - problemIdx = self.indicesToSerial(0, problemIndices) + problemSerial = self.indicesToSerial(0, problemIndices) for solutionIdx in range(0, self.numSolutions): - gflops = self.data[problemIdx+solutionIdx] + gflops = self.data[problemSerial+solutionIdx] neighborGFlops = [] smoothProblem = False for iIdx in range(0, self.numIndices): @@ -304,132 +304,107 @@ def smooth(self): problemSizes[i] = self.problemIndexToSize[i][problemIndices[i]] s += "%u, " % problemSizes[i] new = sum(neighborGFlops)/len(neighborGFlops) - old = self.data[problemIdx+solutionIdx] + old = self.data[problemSerial+solutionIdx] s += "%f -> %f" % (old, new) print s - self.data[problemIdx+solutionIdx] \ + self.data[problemSerial+solutionIdx] \ = sum(neighborGFlops)/len(neighborGFlops) ############################################################################## # ENTRY: En Rule # currentIndexIndex = 0, 1, 2, 3... + # currentIndexRange will have only 1 size for prior indices (unless initial) + # + # Rule: + # [128, [ + # [64, [ + # [16, 0], + # [2880,1] + # ] + # ], + # [96, [ + # [16, 0], + # [64, 1] + # ] + # ] + # ] + # ], another + # + # ############################################################################## def enRule(self, currentIndexIndex, currentIndexRange): + tab = "" + for i in range(0, currentIndexIndex): + tab += " " + print "%senRule(%u, %s)" % (tab, currentIndexIndex, currentIndexRange) currentIndex = self.indexOrder[currentIndexIndex] - lastIndex = currentIndexIndex == self.numIndices-1 + nextIndexIndex = currentIndexIndex+1 + nextIndexRange = deepcopy(currentIndexRange) + isLastIndex = currentIndexIndex == self.numIndices-1 # if there's only 1 problem size here if currentIndexRange[currentIndex][1] \ - currentIndexRange[currentIndex][0] == 1: + # this is last index, so just return fastest solution - if lastIndex: - scores = scoreRangeForSolutions(currentIndexRange) - winnerIdx = 0 - for solutionIdx in range(1, self.numSolution): - if scores[solutionIdx] < scores[winnerIdx]: - winnerIdx = solutionIdx - rule = [ -1, winnerIdx ] + if isLastIndex: + # optimize b/c this should be only single problem + #scores = self.scoreRangeForSolutions(currentIndexRange) + #winnerIdx = 0 + #for solutionIdx in range(1, self.numSolution): + # if scores[solutionIdx] < scores[winnerIdx]: + # winnerIdx = solutionIdx + winnerIdx = self.winnerForRange(currentIndexRange) + print "%s returning early winner=%u" % (tab, winnerIdx) + return [ -1, winnerIdx ] + # this isn't last index, so just return next index else: - newIndexIndex = currentIndexIndex+1 - newIndexRange = deepcopy(currentIndexRange) - rule = [ -1, self.enRule(newIndexIndex, newIndexRange) ] - - # create rule for smallest size - - # for all problem indices in this index - for problemIndex in range(currentIndexRange[currentIndex][0], \ - currentIndexRange[currentIndex][1]): - # rules = seed with smallest rule - # for dimIdx = 0 -> numSizes - # if newRule - # score range using newRule - # score range using priorRule - # accept/reject based on score - # current index is dimOrder[0] - - - + print "%s returning early enRule(%u,%s)" \ + % (tab, nextIndexIndex, nextIndexRange) + return [ -1, self.enRule(nextIndexIndex, nextIndexRange) ] + # ruleList + ruleList = [] - sumValues = [] - totalSummationSizes = 1 - for i in self.rangeIndicesSummation: - totalSummationSizes *= self.numProblemSizes[i] - summationPermutations = [] - for permutationIdx in range(0, totalSummationSizes): - permutation = [] - permutationSize = 1 - pIdx = permutationIdx - for i in self.rangeIndicesSummation: - idx = pIdx % self.numProblemSizes[i] - permutation.append(idx) - permutationSize *= self.problemIndexToSize[i][idx] - pIdx /= self.numProblemSizes[i] - # insert permutation in sorted order - insertIdx = len(summationPermutations)-1 - for pIdx in range(0, len(summationPermutations)): - size = 1 - for i in self.rangeIndicesSummation: - size *= self.problemIndexToSize[i][summationPermutations[pIdx][i]] - if permutationSize > size: - insertIdx = pIdx - break - summationPermutations.insert(insertIdx, permutation) - print "SummationPermutations:", summationPermutations - + # create rule for smallest size + initialSize = min(currentIndexRange[currentIndex][0] \ + + self.parameters["InitialSolutionWindow"], \ + self.numProblemSizes[currentIndex]) + nextIndexRange[currentIndex][1] = initialSize + if isLastIndex: + winnerIdx = self.winnerForRange(nextIndexRange) + initialRule = [ currentIndexRange[currentIndex][0], winnerIdx] - if len(summationPermutations) == 1: - rules = [ 0, self.createRules01(summationPermutations[0]) ] - return rules else: - printExit("No Logic to support multiple summation sizes.") - # iterate over summation permutations -# for each serial pair, scoreA, scoreB, scoreAB -# keep rule AB if scoreAB isn't much slower than scoreA + scoreB + initialRule = [ currentIndexRange[currentIndex][0], \ + self.enRule(nextIndexIndex, nextIndexRange) ] + ruleList.append(initialRule) - """ - sizeSummation *= self.problemIndexToSize[i][problemIndices[i]] + # for all problem indices in this index - firstProblemIndices = [] - lastProblemIndices = [] - for i in range(0, self.numIndices): - firstProblemIndices.append(0) - lastProblemIndices.append(self.numProblems[i]-1) - minSumValue = self.getSizeSummation(firstProblemIndices) - maxSumValue = self.getSizeSummation(lastProblemIndices) - numSumValues = - - - rule = [ - [ - minU, # k threshold - [[min01,s], [0,s]], # diagonals - [0, max0, [[min1,s], [min1,s]]], # skinny0's - [1, max1, [[min0,s], [min0,s]]], # skinny1's - ], - [ - minU, # k threshold - [[min01,s], [0,s]], # diagonals - [0, max0, [[min1,s], [min1,s]]], # skinny0's - [1, max1, [[min0,s], [min0,s]]], # skinny1's - ], - ] - - ruleA = createRules01() - ruleB = createRules01() - - minSumValue = 0 - maxSumValue = self.numProblems + for problemIndex in range(currentIndexRange[currentIndex][0], \ + currentIndexRange[currentIndex][1]): + print "%s pIdx: %u" % (tab, problemIndex) + nextIndexRange[currentIndex][0] = problemIndex + nextIndexRange[currentIndex][1] = problemIndex+1 + if isLastIndex: + winnerIdx = self.winnerForRange(currentIndexRange) + candidateRule = [ currentIndexRange[currentIndex][0], winnerIdx] + else: + candidateRule = [ problemIndex, self.enRule(nextIndexIndex, \ + nextIndexRange) ] + priorRule = ruleList[len(ruleList)-1] + priorRuleScore = self.scoreRangeForLogic(nextIndexRange, priorRule) + candidateRuleScore = self.scoreRangeForLogic(nextIndexRange, \ + candidateRule) + candidateRuleScore += self.parameters["BranchWeight"] # penalize + if candidateRuleScore < priorRuleScore: + ruleList.append(candidateRule) - sizeSummation = 1 - for i in range(self.problemType["NumIndicesC"], \ - self.problemType["TotalIndices"]): - sizeSummation *= self.problemIndexToSize[i][problemIndices[i]] - return sizeSummation - """ + return ruleList @@ -443,585 +418,6 @@ def enRule(self, currentIndexIndex, currentIndexRange): - ############################################################################## - # Create Rules dim0 / dim1 - ############################################################################## - def createRules01(self, problemSizeSummation ): - - diagonalRules = self.createRulesDiagonal(problemSizeSummation) - - - ############################################################################## - # Create Rules Diagonal - ############################################################################## - def createRulesDiagonal(self, problemSizeSummation): - thresholdForDiagonality = 1.5 # slightly fewer problems than 2 - numProblemSizesFastestDiagonal = 16 - problemIndices = [0]*self.numIndices - for i in self.rangeIndicesSummation: - problemIndices[i] = problemSizeSummation[i \ - - self.problemType["NumIndicesC"]] - print2("\nDiagonalRules for %s" % problemIndices) - problemSizes = [0]*self.numIndices - totalFlopsPerSizeFree = self.flopsPerMac - for i in self.rangeIndicesSummation: - totalFlopsPerSizeFree *= self.problemIndexToSize[i][problemIndices[i]] - print "totalFlopsPerSizeFree", totalFlopsPerSizeFree - - ######################################## - # transform data into serial list of "diagonal problem sizes" - diagonalData = [] - moreProblems = True - while moreProblems: - - # size free - for i in range(0, self.numIndices): - problemSizes[i] = self.problemIndexToSize[i][problemIndices[i]] - size0 = problemSizes[self.idx0] - size1 = problemSizes[self.idx1] - - # if diagonal - if size0 < size1*thresholdForDiagonality \ - and size1 < size0*thresholdForDiagonality: - sizeFree = self.getSizeFree(problemIndices) - - problemIdx = self.indicesToSerial(0, problemIndices) - solutionGFlops = [] - for i in range(0, self.numSolutions): - solutionGFlops.append(self.data[problemIdx+i]) - - diagonalData.append([ sizeFree, solutionGFlops ]) - - # next problem - problemIndices[0] += 1 - for i in self.rangeIndicesFree: - if problemIndices[i] >= self.numProblemSizes[i]: - if i == self.problemType["NumIndicesFree"]-1: - moreProblems = False - break - else: - problemIndices[i] = 0 - problemIndices[i+1] += 1 - else: - break - - diagonalData.sort(key=lambda x: x[0], reverse=True) - for dd in diagonalData: - print "DD[%u]: %s" % (dd[0], dd[1]) - print len(diagonalData) - - - ######################################## - # create first rule - sizeFree = diagonalData[0][0] - relativeTime = [0]*self.numSolutions - for i in range(0, numProblemSizesFastestDiagonal): - for j in range(0, self.numSolutions): - gflops = diagonalData[i][1][j] - relativeTime[j] += 1 / gflops - winnerIdx = 0 - winnerRelativeTime = relativeTime[0] - for i in range(1, self.numSolutions): - if relativeTime[i] < winnerRelativeTime: - winnerIdx = i - winnerRelativeTime = relativeTime[i] - print "FastestDiagonalSolution:", winnerIdx, self.solutionNames[winnerIdx] - fastestGFlops = 0 - for i in range(0, numProblemSizesFastestDiagonal): - gflops = diagonalData[i][1][winnerIdx] - if gflops > fastestGFlops: - fastestGFlops = gflops - - rules = [] - # minGFlops maxGFlops oldGFlops? - rules.append([winnerIdx, sizeFree, fastestGFlops, fastestGFlops, -1]) - print "Winner[%3u]: %u" % (0, winnerIdx) -# we can't just pay attention to single winner -# we need to compute scores for all solutions over a window -# b/c 441115111333 -# = 441555555333 -# -# we can do a smoothing pass to get rid of bogus data; if a data point is more than x% slower than 4 surrounding points, than its bogus, just set it equal to average of 4 surrounding points -# - - ######################################## - # create subsequent rules for smaller sizes - for diagonalDataIdx in range(1, len(diagonalData)): - print "DiagonalDataIdx:", diagonalDataIdx - # prior rule - priorRule = rules[len(rules)-1] - priorWinnerIdx = priorRule[0] - # candidate winner - candidateWinnerIdx = 0 - candidateWinnerGFlops = diagonalData[diagonalDataIdx][1][0] - for j in range(1, self.numSolutions): - gflops = diagonalData[diagonalDataIdx][1][j] - if gflops > candidateWinnerGFlops: - candidateWinnerIdx = j - candidateWinnerGFlops = gflops - if candidateWinnerIdx == priorWinnerIdx: - # update prior rule to include this sizeFree - rules[len(rules)-1][1] = diagonalData[diagonalDataIdx][0] # size free - rules[len(rules)-1][2] = \ - diagonalData[diagonalDataIdx][1][priorWinnerIdx] # perf at size - continue - else: - # candidate rule - sizeFree = diagonalData[diagonalDataIdx][0] - totalFlops = sizeFree*totalFlopsPerSizeFree - candidateGFlops = diagonalData[diagonalDataIdx][1][candidateWinnerIdx] - priorGFlops = diagonalData[diagonalDataIdx][1][priorWinnerIdx] - candidateRule = [ candidateWinnerIdx, sizeFree, candidateGFlops, \ - candidateGFlops, -1 ] - # candidate and prior scores - candidateTimeUs = totalFlops / candidateGFlops / 1000 - priorTimeUs = totalFlops / priorGFlops / 1000 - candidateScore = 1*self.w2 + candidateTimeUs - priorScore = 0*self.w2 + priorTimeUs - print "DDI[%3u] Prior[%2u]: %.0fus vs Candi[%2u]: %.0fus" \ - % (diagonalDataIdx, priorWinnerIdx, priorScore, candidateWinnerIdx, candidateScore) - checkMoreProblems = True - for newDiagonalDataIdx in range(diagonalDataIdx+1, len(diagonalData)): - newWinnerIdx = 0 - newWinnerGFlops = diagonalData[newDiagonalDataIdx][1][0] - for j in range(1, self.numSolutions): - gflops = diagonalData[newDiagonalDataIdx][1][j] - if gflops > newWinnerGFlops: - newWinnerIdx = j - newWinnerGFlops = gflops - # update candidate and prior scores - sizeFree = diagonalData[newDiagonalDataIdx][0] - totalFlops = sizeFree*totalFlopsPerSizeFree - candidateGFlops = \ - diagonalData[newDiagonalDataIdx][1][candidateWinnerIdx] - priorGFlops = diagonalData[newDiagonalDataIdx][1][priorWinnerIdx] - candidateTimeUs = totalFlops / candidateGFlops / 1000 - priorTimeUs = totalFlops / priorGFlops / 1000 - candidateScore += candidateTimeUs - priorScore += priorTimeUs - print " NDDI[%3u] Prior[%2u]: %.0fus vs Candi[%2u]: %.0fus" \ - % (newDiagonalDataIdx, priorWinnerIdx, priorScore, \ - candidateWinnerIdx, candidateScore) - if newWinnerIdx == candidateWinnerIdx: - print " newWinnerIdx == candidateWinnerIdx" - if candidateScore < priorScore: - # candidate rule accepted - rules.append(candidateRule) - print " accepting" - break - else: - # candidate rule not yet accepted - candidateRule[1] = sizeFree - candidateRule[2] = candidateGFlops - print " continuing" - continue - elif newWinnerIdx == priorWinnerIdx: - print " newWinnerIdx == priorWinnerIdx" - # returned to original winner, decide now to accept/reject - if candidateScore < priorScore: - # candidate rule accepted - rules.append(candidateRule) - print " accepting" - break - else: - # candidate rule rejected; update prior, continue at newSize - rules[len(rules)-1][1] = sizeFree - rules[len(rules)-1][2] = priorGFlops - diagonalDataIdx = newDiagonalDataIdx - print " rejecting" - break - else: - print " newWinnerIdx is %u" % newWinnerIdx - # new winner was a 3rd solution; decide now (same as above) - if candidateScore < priorScore: - # candidate rule accepted - rules.append(candidateRule) - print " accepting" - break - else: - # candidate rule rejected; update prior, continue at newSize - rules[len(rules)-1][1] = diagonalData[newDiagonalDataIdx][0] - rules[len(rules)-1][2] = \ - diagonalData[newDiagonalDataIdx][1][priorWinnerIdx] - diagonalDataIdx = newDiagonalDataIdx - print " rejecting" - break - - return - - # go farther forward, does candidate rule keep winning, or does priorRule keep winning? - # the new rule should start at a loss b/c of Weight2 - # a few problems in the future - # if new rule is better, W2 gets amortized, Wt improves - # if new rule is worse, W2 gets amortized, Wt worsens - # continue to future problems until, and make final decision - # newRule gets better score; accept - # return to priorRule winner; accept/reject - # Yet a new winner - # easy: make final accept/reject including this new problem size - # hard: recure? - # - # is the num problems in future vary with W2,Wt? -# Wt = 1 -# W2 = 1 means we would rather lose 1us per kernel rather than adding another split (actually they're equal) -# so, in order for candidate to be accepted immediately, it must improve all kernels by more than 1us, or after 2 sizes, improve by 0.5us per kernel -# -# -# 0 0 1 0 0 -# 0 0 1 1 0 -# 0 0 1 4 0 -# 0 0 1 4 1 0 -# - - print "Winner[%3u]: %u" % (i, winnerIdx) - - - return - - - - - - - - - - - - - - - - - - # abstract to multidimensions - # what is the diagonal - dilation = self.self.parameters["Dilation"] - threshold = self.self.parameters["Threshold"] - numProblems0 = self.numProblemSizes[self.idx0] - - ############################################################################ - # determine winner at largest size - solutionNumWins = [0]*self.numSolutions - solutionGFlops = [0]*self.numSolutions - for problemSizeIdx in range(max(0,numProblems0-dilation*2), numProblems0): - problemIndices[self.idx0] = problemSizeIdx - problemIndices[self.idx1] = problemSizeIdx - problemIdx = self.indicesToSerial(0, problemIndices) - winnerIdx = -1 - winnerGFlops = -1 - for solutionIdx in range(0, self.numSolutions): - solutionSerialIdx = problemIdx + solutionIdx - solutionTmpGFlops = self.data[solutionSerialIdx] - if solutionTmpGFlops > winnerGFlops: - winnerIdx = solutionIdx - winnerGFlops = solutionTmpGFlops - #print "updated winner: ", winnerIdx - #print winnerIdx - solutionNumWins[winnerIdx] += 1 - if winnerGFlops > solutionGFlops[winnerIdx]: - solutionGFlops[winnerIdx] = winnerGFlops - largestWinnerIdx = -1 - largestWinnerNumWins = -1 - largestWinnerGFlops = -1 - #print "FastestWins:" - for i in range(0, self.numSolutions): - #print "sol[%u] = %u wins @ %.0f GFlops" \ - # % (i, solutionNumWins[i], solutionGFlops[i]) - if solutionNumWins[i] > largestWinnerNumWins: - largestWinnerIdx = i - largestWinnerNumWins = solutionNumWins[i] - largestWinnerGFlops = solutionGFlops[i] - #print "Winner at Largest Problem: S[%u] @ %.0f GFlops with %u/%u wins" \ - # % (largestWinnerIdx, largestWinnerGFlops, largestWinnerNumWins, \ - # dilation*2) - problemIndices[self.idx0] = numProblems0-1 - problemIndices[self.idx1] = numProblems0-1 - largestWinnerAtLargestProblemIdx = self.indicesToSerial(largestWinnerIdx, \ - problemIndices) - largestWinnerGFlopsAtLargestSize = \ - self.data[largestWinnerAtLargestProblemIdx] - - ############################################################################ - # Diagonal Rule - # solutionIdx, minSizeThresholdIdx, gflops at minSize, maxGFlops, oldGFlops - numRules = 1 - diagonalRules = [ [largestWinnerIdx, deepcopy(problemIndices), \ - largestWinnerGFlopsAtLargestSize, largestWinnerGFlops, -1] ] - - ############################################################################ - # For largest to smallest, determine fastest solution - for problemSizeIdx in range(numProblems0-2, -1, -1): - problemIndices[self.idx0] = problemSizeIdx - problemIndices[self.idx1] = problemSizeIdx - problemIdx = self.indicesToSerial(0, problemIndices) - - # current rule winner performance at this problemSizeIdx - ruleWinnerIdx = diagonalRules[-1][0] - ruleWinnerGFlopsForSize = self.data[problemIdx + ruleWinnerIdx] - - #determine fastest at this problemSizeIdx - (winnerForSizeIdx, winnerForSizeGFlops) = \ - self.getWinnerForProblem( problemIndices ) - - # ruleWinner also wins at this problem size (at least by threshold) - if winnerForSizeIdx == ruleWinnerIdx \ - or ruleWinnerGFlopsForSize > (1-threshold)*winnerForSizeGFlops: - # just update rule - diagonalRules[numRules-1][1] = deepcopy(problemIndices) - diagonalRules[numRules-1][2] = ruleWinnerGFlopsForSize - diagonalRules[numRules-1][3] = max(diagonalRules[numRules-1][3], \ - ruleWinnerGFlopsForSize) - - # we have a new candidate winner - # only keep it if don't revert back to ruleWinner over next Dilation - else: - - # check if we don't revert back to ruleWinner over next Dilation probs - revert = False - endDilationIdx = max(-1, problemSizeIdx-dilation) - for dilationSizeIdx in range(problemSizeIdx-1, \ - endDilationIdx, -1): - problemIndices[self.idx0] = dilationSizeIdx - problemIndices[self.idx1] = dilationSizeIdx - dilationIdx = self.indicesToSerial(0, problemIndices) - ruleWinnerGFlopsForDilation = self.data[dilationIdx \ - + ruleWinnerIdx] - #determine fastest at this problemSizeIdx - (winnerForDilationIdx, winnerForDilationGFlops) = \ - self.getWinnerForProblem(problemIndices) - - # ruleWinner also wins at dilation size (at least by threshold) - if winnerForDilationIdx == ruleWinnerIdx \ - or ruleWinnerGFlopsForDilation \ - > (1-threshold)*winnerForSizeGFlops: - # yes, within Dilation, we've returned to same winner - revert = True - # so update rule for this size - diagonalRules[numRules-1][1] = deepcopy(problemIndices) - diagonalRules[numRules-1][2] = winnerForDilationGFlops - diagonalRules[numRules-1][3] = max(diagonalRules[numRules-1][3], \ - winnerForSizeGFlops) - # resume outer loop after dilation - problemSizeIdx = dilationSizeIdx - break - else: - # different winner at this dilation size - # don't need to do anything - pass - - # if we never revert to rule during dilation, create new rule - if not revert: - # solutionIdx, minSizeThresholdIdx, gflops at minSize, maxGFlops, old - newRule = [ winnerForSizeIdx, deepcopy(problemIndices), \ - winnerForSizeGFlops, winnerForSizeGFlops, ruleWinnerGFlopsForSize] - diagonalRules.append(newRule) - numRules += 1 - #print "Added new rule: %s" % newRule - - return diagonalRules - #end diagonal rules - - - ############################################################################## - # Skinny Solutions - ############################################################################## - def getSkinnySolutions(self, diagonalRules, problemIndices, \ - idxLarge, idxSmall): - idx0 = self.idx0 - idx1 = self.idx1 - #idxU = self.idxU - #dilation = self.self.parameters["Dilation"] - threshold = self.self.parameters["Threshold"] - - skinnyRules = [] - - # for each size threshold along diagonal - for diagonalRuleIdx in range(0, len(diagonalRules)): - diagonalRule = diagonalRules[diagonalRuleIdx] - diagonalRuleWinnerIdx = diagonalRule[0] - diagonalRuleThresholdProblem = diagonalRule[1] - #diagonalRuleGFlops = diagonalRule[2] # perf at threshold - thresholdSizeFree = self.getSizeFree(diagonalRuleThresholdProblem) - print2("ThresholdSizeFree[%u][%u]: %u" \ - % (diagonalRuleThresholdProblem[idx0], \ - diagonalRuleThresholdProblem[idx1], \ - thresholdSizeFree)) - - # check skinny d0< thresholdSizeFree: - #print "SkinnySizeFree[%u][%u]: %u" % (sizeIdxSmall, sizeIdxLarge, \ - # skinnySizeFree) - - # rule winner's performance at this skinnyness - skinnyProblemIdx = self.indicesToSerial(0, skinnyProblemIndices) - diagonalWinnerGFlopsForSkinny = self.data[skinnyProblemIdx \ - + diagonalRuleWinnerIdx] - - # which solution wins here? - (winnerIdx, winnerGFlops) = \ - self.getWinnerForProblem(skinnyProblemIndices) - #print winnerIdx, winnerGFlops - if winnerIdx == diagonalRuleWinnerIdx \ - or diagonalWinnerGFlopsForSkinny > (1-threshold)*winnerGFlops: - # diagonal rule also wins here - print2("if dS <%5u and dL >%5u diagnl S[%2u] %5.0f == S[%2u] %5.0f GFlops" \ - % (self.problemIndexToSize[idxSmall][sizeIdxSmall], \ - self.problemIndexToSize[idxLarge][sizeIdxLarge], \ - winnerIdx, winnerGFlops, diagonalRuleWinnerIdx, \ - diagonalWinnerGFlopsForSkinny )) - pass - else: - # we're so skinny that diagonal rule no longer applies - print2("if dS <%5u and dL >%5u skinny S[%2u] %5.0f >> S[%2u] %5.0f GFlops" \ - % (self.problemIndexToSize[idxSmall][sizeIdxSmall], \ - self.problemIndexToSize[idxLarge][sizeIdxLarge], \ - winnerIdx, winnerGFlops, diagonalRuleWinnerIdx, \ - diagonalWinnerGFlopsForSkinny )) - skinnyRule = [deepcopy(skinnyProblemIndices), winnerIdx, \ - winnerGFlops] - skinnyRules.append(skinnyRule) - # TODO need to use dilate parameter to make sure we've switched - # TODO data along this size may not agree with - # data along different sizes (but perhaps it should - # TODO need extra loop here, to iterate idxSmall to - # smaller sizes to see if the solution changes further - - # does the diagonalRuleWinner also win here? - break # only check the problem size closest to ruleSize - - return skinnyRules - # end skinny solutions - - - ############################################################################## - # Determine Logic Along U - ############################################################################## - def determineLogicAlongU(self): - globalRange = [] - for i in range(0, self.numIndices): - globalRange.append( [0, self.numProblemSizes[i]] ) - - - - - self.print2D([0, 0]) - - ############################################################################ - # Determine Solutions Along Diagonal - # roughly same splitting regardless of sizeU - problemIndices = [] - for numProblemsForIndex in self.numProblemSizes: - problemIndices.append(numProblemsForIndex-1) - print problemIndices - self.diagonalRules = self.getFastestSolutionsAlongDiagonal(problemIndices) - if True: - print2("Diagonal Rules:") - for rule in self.diagonalRules: - string = " if freeSize >=%4u" % self.problemIndexToSize[0][rule[1][0]] - for i in range(1, self.numIndices): - string += "x%4u" % self.problemIndexToSize[i][rule[1][i]] - string += " return S[%u] @ %5.0f-%5.0f>%5.0f GFlops is %s" \ - % (rule[0], rule[2], rule[3], rule[4], \ - self.solutionNames[rule[0]]) - print2(string) - - ############################################################################ - # Determine Skinny0 Solutions - skinnyRules01 = self.getSkinnySolutions(self.diagonalRules, problemIndices, \ - self.idx0, self.idx1) - #print "Skinny Rules:" - #for rule in skinnyRules01: - # string = " if freeSize >=%4u" % data.problemIndexToSize[0][rule[1][0]] - # for i in range(1, data.numIndices): - # string += "x%4u" % data.problemIndexToSize[i][rule[1][i]] - # string += " return S[%u] @ %5.0f-%5.0f>%5.0f GFlops is %s" \ - # % (rule[0], rule[2], rule[3], rule[4], \ - # data.solutionNames[rule[0]]) - - ############################################################################ - # Determine Skinny1 Solutions - skinnyRules10 = self.getSkinnySolutions(self.diagonalRules, problemIndices, \ - self.idx1, self.idx0) - - # list solutions that actually get used - solutionIndicesUsed = [] - for rule in skinnyRules01: - pass - for rule in skinnyRules10: - pass - for rule in self.diagonalRules: - solutionIdx = rule[0] - solution = self.solutions[solutionIdx] - MT0 = solution["MacroTile0"] - MT1 = solution["MacroTile1"] - DU = solution["DepthU"] - #print "Rule Tile S[%u]: %ux%ux%u" % (solutionIdx, MT0, MT1, DU) - # is this solution in the list - inList = False - for solutionUsed in solutionIndicesUsed: - if solutionUsed[0] == solutionIdx: - inList = True - break - if not inList: - insertIdx = len(solutionIndicesUsed) - for i in range(0, len(solutionIndicesUsed)): - iMT0 = solutionIndicesUsed[i][1] - iMT1 = solutionIndicesUsed[i][2] - iDU = solutionIndicesUsed[i][3] - #print " compare S[%u]: %ux%ux%u" % (solutionIndicesUsed[i][0], \ - # iMT0, iMT1, iDU) - if MT0*MT1 < iMT0*iMT1: - insertIdx = i - break - elif MT0*MT1 > iMT0*iMT1: - continue - else: # MT == MT - if DU < iDU: - insertIdx = i - break - else: - continue - - # if i'm smaller than i, insert me before i - #print "insert: %u" % insertIdx - solutionIndicesUsed.insert(insertIdx, [solutionIdx, MT0, MT1, DU]) - #print solutionIndicesUsed - - # list of solutions used - self.solutionsUsed = [] - for solutionIndexUsed in solutionIndicesUsed: - self.solutionsUsed.append(self.solutions[solutionIndexUsed[0]]) - - # translate rules to new solution indices - for rule in skinnyRules01: - pass - for rule in skinnyRules10: - pass - for ruleIdx in range(0, len(self.diagonalRules)): - solutionIdx = self.diagonalRules[ruleIdx][0] - for i in range(0, len(solutionIndicesUsed)): - solutionIndexUsed = solutionIndicesUsed[i] - if solutionIdx == solutionIndexUsed[0]: - self.diagonalRules[ruleIdx][0] = i - break - # change problemSizeIndices to sizes - for i in range(0, 3): - self.diagonalRules[ruleIdx][1][i] = \ - self.problemIndexToSize[i][ self.diagonalRules[ruleIdx][1][i] ] - - print2("# New Rules: %s" % self.diagonalRules) - - ############################################################################## ############################################################################## @@ -1078,22 +474,22 @@ def print2D(self, indices ): sss[sIdx] += "%4u" % self.problemIndexToSize[0][i] for j in range(0, self.numProblemSizes[1]): problemIndices[self.idx1] = j - problemIdx = self.indicesToSerial(0, problemIndices) + problemSerial = self.indicesToSerial(0, problemIndices) for sIdx in range(0, self.numSolutions): - sss[sIdx] += ",%f" % self.data[problemIdx+sIdx] + sss[sIdx] += ",%f" % self.data[problemSerial+sIdx] - if self.data[problemIdx+0] > self.data[problemIdx+1]: + if self.data[problemSerial+0] > self.data[problemSerial+1]: winnerIdx = 0 - winnerGFlops = self.data[problemIdx+0] + winnerGFlops = self.data[problemSerial+0] secondIdx = 1 - secondGFlops = self.data[problemIdx+1] + secondGFlops = self.data[problemSerial+1] else: winnerIdx = 1 - winnerGFlops = self.data[problemIdx+1] + winnerGFlops = self.data[problemSerial+1] secondIdx = 0 - secondGFlops = self.data[problemIdx+0] + secondGFlops = self.data[problemSerial+0] for solutionIdx in range(2, self.numSolutions): - solutionSerialIdx = problemIdx + solutionIdx + solutionSerialIdx = problemSerial + solutionIdx solutionGFlops = self.data[solutionSerialIdx] if solutionGFlops > winnerGFlops: #print "%f > %f" % (solutionGFlops, winnerGFlops) @@ -1157,20 +553,20 @@ def leastImportantSolution(self): for size in problemSizes: totalFlops *= size - problemIdx = self.indicesToSerial(0, problemIndices) - if self.data[problemIdx+0] > self.data[problemIdx+1]: + problemSerial = self.indicesToSerial(0, problemIndices) + if self.data[problemSerial+0] > self.data[problemSerial+1]: winnerIdx = 0 - winnerGFlops = self.data[problemIdx+0] + winnerGFlops = self.data[problemSerial+0] secondIdx = 1 - secondGFlops = self.data[problemIdx+1] + secondGFlops = self.data[problemSerial+1] else: winnerIdx = 1 - winnerGFlops = self.data[problemIdx+1] + winnerGFlops = self.data[problemSerial+1] secondIdx = 0 - secondGFlops = self.data[problemIdx+0] + secondGFlops = self.data[problemSerial+0] for solutionIdx in range(2, self.numSolutions): - solutionSerialIdx = problemIdx + solutionIdx + solutionSerialIdx = problemSerial + solutionIdx solutionGFlops = self.data[solutionSerialIdx] if solutionGFlops > winnerGFlops: secondIdx = winnerIdx @@ -1194,9 +590,29 @@ def leastImportantSolution(self): ############################################################################## - # Score Range For Logic - def scoreRangeForLogic(self, indexRange, logic): - pass + # Get Winner For Problem + def getWinnerForProblem(self, problemIndices): + problemSerial = self.indicesToSerial(0, problemIndices) + winnerIdx = -1 + winnerGFlops = -1 + for solutionIdx in range(0, self.numSolutions): + solutionSerialIdx = problemSerial + solutionIdx + solutionGFlops = self.data[solutionSerialIdx] + if solutionGFlops > winnerGFlops: + #print "%f > %f" % (solutionGFlops, winnerGFlops) + winnerIdx = solutionIdx + winnerGFlops = solutionGFlops + return (winnerIdx, winnerGFlops) + + ############################################################################## + # Winner For Range + def winnerForRange(self, indexRange): + scores = self.scoreRangeForSolutions(indexRange) + winnerIdx = 0 + for solutionIdx in range(1, self.numSolutions): + if scores[solutionIdx] < scores[winnerIdx]: + winnerIdx = solutionIdx + return winnerIdx ############################################################################## # Score (microseconds) Range For Solutions @@ -1211,6 +627,89 @@ def scoreRangeForSolutions(self, indexRange): scores[solutionIdx] += timeUs return scores + ############################################################################## + # Score Range For Logic + def scoreRangeForLogic(self, indexRange, logic): + print "ScoreRangeForLogic", indexRange, logic + depth = self.getLogicDepth([logic]) + depth = self.numIndices - depth + #obj = logic + #while isinstance(obj[0], list): + # obj = obj[0][1] + # depth -= 1 + print "Depth:", depth + fullLogic = deepcopy(logic) + for i in range(0, depth): + #print "Logic:", fullLogic + fullLogic = [-1, [fullLogic]] + fullLogic = [fullLogic] + #print "FullLogic:", fullLogic + return self.scoreRangeForFullLogic(indexRange, fullLogic) + + ############################################################################## + # Score Range For Full Logic + def scoreRangeForFullLogic(self, indexRange, logic): + print "ScoreRangeForFullLogic", indexRange, logic + score = 0 + for problemIndices in self.problemIndicesForRange(indexRange): + problemSerial = self.indicesToSerial(0, problemIndices) + totalFlops = self.totalFlopsForProblemIndices(problemIndices) + solutionIdx = self.getSolutionForProblemIndicesUsingLogic( \ + problemIndices, logic) + gflops = self.data[problemSerial + solutionIdx] + timeUs = totalFlops / gflops / 1000 + score += timeUs + logicComplexity = [0]*self.numIndices + self.scoreLogicComplexity(logic, logicComplexity) + score += self.parameters["BranchWeight"] * sum(logicComplexity) + print "LogicComplexity:", logicComplexity + return score + + ############################################################################## + # Get Solution For Problem Indices Using Logic + def getSolutionForProblemIndicesUsingLogic(self, problemIndices, logic): + currentProblemIndices = problemIndices + currentLogic = logic + for i in range(0, self.numIndices): + #print "CurrentLogic[%u]: %s" % (i, currentLogic) + currentSizeIndex = currentProblemIndices[0] + for j in range(0, len(currentLogic)): + if currentLogic[j][0] < 0: + currentProblemIndices = currentProblemIndices[1:] + currentLogic = currentLogic[j][1] + break + if currentLogic[j][0] >= 0: + if currentSizeIndex <= currentLogic[j][0]: + currentProblemIndices = currentProblemIndices[1:] + currentLogic = currentLogic[j][1] + break + #print "CurrentLogic[%u]: %s" % (i, currentLogic) + return currentLogic + + ############################################################################## + # Score Logic Complexity + def scoreLogicComplexity(self, logic, logicComplexity): + print "ScoreLogicComplexity: %s" % (logic) + depth = self.getLogicDepth(logic) + depth = self.numIndices - depth + if depth == 0: return + #print "[%u]ScoreLogicComplexity: %s" % (depth, logic) + currentLogic = logic + for i in range(0, len(logic)): + logicComplexity[depth] += 1 + self.scoreLogicComplexity(logic[i][1], logicComplexity) + + + ############################################################################## + # Get Logic Depth + def getLogicDepth(self, logic): + obj = logic + depth = 0 + while isinstance(obj, list): + obj = obj[0][1] + depth += 1 + return depth + ############################################################################## # Total Flops For Problem Indices def totalFlopsForProblemIndices(self, problemIndices): @@ -1247,12 +746,12 @@ def removeSolution(self, removeSolutionIdx): # update data self.totalSize = self.totalProblems * self.numSolutions self.data = array.array('f', [0]*self.totalSize) - for problemIdx in range(0, self.totalProblems): + for problemIndex in range(0, self.totalProblems): newSolutionIdx = 0 for oldSolutionIdx in range(0, oldNumSolutions): if oldSolutionIdx != removeSolutionIdx: - self.data[problemIdx*self.numSolutions+newSolutionIdx] \ - = oldData[problemIdx*oldNumSolutions+oldSolutionIdx] + self.data[problemIndex*self.numSolutions+newSolutionIdx] \ + = oldData[problemIndex*oldNumSolutions+oldSolutionIdx] newSolutionIdx += 1 ############################################################################## @@ -1314,20 +813,6 @@ def problemIndicesForRange(self, indexRange): break return problemIndexList - ############################################################################## - # Get Winner For Problem - def getWinnerForProblem(self, problemIndices): - problemIdx = self.indicesToSerial(0, problemIndices) - winnerIdx = -1 - winnerGFlops = -1 - for solutionIdx in range(0, self.numSolutions): - solutionSerialIdx = problemIdx + solutionIdx - solutionGFlops = self.data[solutionSerialIdx] - if solutionGFlops > winnerGFlops: - #print "%f > %f" % (solutionGFlops, winnerGFlops) - winnerIdx = solutionIdx - winnerGFlops = solutionGFlops - return (winnerIdx, winnerGFlops) ############################################################################## From dab3b3c542cbaef459c96effa442a81fd5c7e79b Mon Sep 17 00:00:00 2001 From: David Tanner Date: Thu, 2 Mar 2017 11:41:08 -0600 Subject: [PATCH 14/21] logic complexity calculation working --- Tensile/LibraryLogic.py | 424 ++++++++++++++++++++++--------------- Tensile/SolutionStructs.py | 2 +- 2 files changed, 253 insertions(+), 173 deletions(-) diff --git a/Tensile/LibraryLogic.py b/Tensile/LibraryLogic.py index 9dfeba30b..ae5b810e9 100644 --- a/Tensile/LibraryLogic.py +++ b/Tensile/LibraryLogic.py @@ -38,30 +38,39 @@ def analyzeProblemType( problemTypeTuple, inputParameters ): ###################################### # Read Data From CSV - logic = LogicAnalyzer(problemType, problemSizes, solutions, inputParameters) - logic.populateFromCSV(dataFileName) + logicAnalyzer = LogicAnalyzer( \ + problemType, problemSizes, solutions, inputParameters) + logicAnalyzer.populateFromCSV(dataFileName) ###################################### # Remove invalid solutions - logic.removeInvalidSolutions() + logicAnalyzer.removeInvalidSolutions() ###################################### # Remove least important solutions - logic.removeLeastImportantSolutions() + logicAnalyzer.removeLeastImportantSolutions() ###################################### # Correct outliers - # logic.smooth() - logic.print2D([0, 0]) + # logicAnalyzer.smooth() + logicAnalyzer.print2D([0, 0]) ###################################### # Create Rules - logic.enRule(0, logic.globalIndexRange) + logic = logicAnalyzer.enRule(0, logicAnalyzer.globalIndexRange) + print "Final Logic:" + print logic + logicComplexity = [0]*logicAnalyzer.numIndices + logicAnalyzer.scoreLogicComplexity(logic, logicComplexity) + print "Logic Complexity:", logicComplexity + score = logicAnalyzer.scoreRangeForLogic( \ + logicAnalyzer.globalIndexRange, logic) + print "Global Score:", score #return (skinnyRules01, skinnyRules10, diagonalRules) - #return (problemType, logic.solutionsUsed, [], [], logic.diagonalRules ) + #return (problemType, logicAnalyzer.solutionsUsed, [], [], logicAnalyzer.diagonalRules ) return (problemType, [], [], [], [] ) @@ -173,6 +182,7 @@ def __init__(self, problemType, problemSizes, solutions, inputParameters): self.globalIndexRange.append([0, self.numProblemSizes[i]]) self.problemIndicesForGlobalRange \ = self.problemIndicesForRange(self.globalIndexRange) + self.tab = [""]*self.numIndices @@ -334,76 +344,135 @@ def smooth(self): # ############################################################################## def enRule(self, currentIndexIndex, currentIndexRange): - tab = "" - for i in range(0, currentIndexIndex): - tab += " " - print "%senRule(%u, %s)" % (tab, currentIndexIndex, currentIndexRange) + cii = currentIndexIndex + if currentIndexIndex == 0: + self.tab[cii] = "| " + elif currentIndexIndex == 1: + self.tab[cii] = "[%2u]-| " % ( \ + currentIndexRange[self.indexOrder[0]][0]) + elif currentIndexIndex == 2: + self.tab[cii] = "[%2u,%2u]--| " % ( \ + currentIndexRange[self.indexOrder[0]][0], \ + currentIndexRange[self.indexOrder[1]][0]) + elif currentIndexIndex == 3: + self.tab[cii] = "[%2u,%2u,%2u]---| " % ( \ + currentIndexRange[self.indexOrder[0]][0], \ + currentIndexRange[self.indexOrder[1]][0], \ + currentIndexRange[self.indexOrder[2]][0]) + elif currentIndexIndex == 4: + self.tab[cii] = "[%2u,%2u,%2u,%2u]---| " % ( \ + currentIndexRange[self.indexOrder[0]][0], \ + currentIndexRange[self.indexOrder[1]][0], \ + currentIndexRange[self.indexOrder[2]][0], \ + currentIndexRange[self.indexOrder[3]][0]) + tab = self.tab[cii] currentIndex = self.indexOrder[currentIndexIndex] + print "%senRule(%s)" % (tab, currentIndexRange) nextIndexIndex = currentIndexIndex+1 nextIndexRange = deepcopy(currentIndexRange) isLastIndex = currentIndexIndex == self.numIndices-1 + ruleList = [] + ######################################## # if there's only 1 problem size here + ######################################## if currentIndexRange[currentIndex][1] \ - currentIndexRange[currentIndex][0] == 1: + ######################################## # this is last index, so just return fastest solution if isLastIndex: - # optimize b/c this should be only single problem + # TODO optimize b/c this should be only single problem #scores = self.scoreRangeForSolutions(currentIndexRange) #winnerIdx = 0 #for solutionIdx in range(1, self.numSolution): # if scores[solutionIdx] < scores[winnerIdx]: # winnerIdx = solutionIdx winnerIdx = self.winnerForRange(currentIndexRange) - print "%s returning early winner=%u" % (tab, winnerIdx) - return [ -1, winnerIdx ] + print "%sreturning early winner=%u" % (tab, winnerIdx) + ruleList.append(-1) + ruleList.append(winnerIdx) - # this isn't last index, so just return next index + ######################################## + # this isn't last index, so just recursively return next index else: - print "%s returning early enRule(%u,%s)" \ - % (tab, nextIndexIndex, nextIndexRange) - return [ -1, self.enRule(nextIndexIndex, nextIndexRange) ] - - # ruleList - ruleList = [] - - # create rule for smallest size - initialSize = min(currentIndexRange[currentIndex][0] \ - + self.parameters["InitialSolutionWindow"], \ - self.numProblemSizes[currentIndex]) - nextIndexRange[currentIndex][1] = initialSize - if isLastIndex: - winnerIdx = self.winnerForRange(nextIndexRange) - initialRule = [ currentIndexRange[currentIndex][0], winnerIdx] - + print "%sreturning early enRule(%s)" \ + % (tab, nextIndexRange) + rule = [ -1, self.enRule(nextIndexIndex, nextIndexRange) ] + ruleList.append(rule) + + ######################################## + # full iterative rule list + ######################################## else: - initialRule = [ currentIndexRange[currentIndex][0], \ - self.enRule(nextIndexIndex, nextIndexRange) ] - ruleList.append(initialRule) - - # for all problem indices in this index - - for problemIndex in range(currentIndexRange[currentIndex][0], \ - currentIndexRange[currentIndex][1]): - print "%s pIdx: %u" % (tab, problemIndex) - nextIndexRange[currentIndex][0] = problemIndex - nextIndexRange[currentIndex][1] = problemIndex+1 + ######################################## + # create initial rule + initialSize = min(currentIndexRange[currentIndex][0] \ + + self.parameters["InitialSolutionWindow"], \ + self.numProblemSizes[currentIndex]) + nextIndexRange[currentIndex][1] = initialSize if isLastIndex: - winnerIdx = self.winnerForRange(currentIndexRange) - candidateRule = [ currentIndexRange[currentIndex][0], winnerIdx] + winnerIdx = self.winnerForRange(nextIndexRange) + initialRule = [ currentIndexRange[currentIndex][0], winnerIdx] else: - candidateRule = [ problemIndex, self.enRule(nextIndexIndex, \ - nextIndexRange) ] - priorRule = ruleList[len(ruleList)-1] - priorRuleScore = self.scoreRangeForLogic(nextIndexRange, priorRule) - candidateRuleScore = self.scoreRangeForLogic(nextIndexRange, \ - candidateRule) - candidateRuleScore += self.parameters["BranchWeight"] # penalize - if candidateRuleScore < priorRuleScore: - ruleList.append(candidateRule) + print "%sinitialRule(%s)" % (tab, nextIndexRange) + initialRule = [ currentIndexRange[currentIndex][0], \ + self.enRule(nextIndexIndex, nextIndexRange) ] + print "%sinitialRule(%s) DONE" % (tab, nextIndexRange) + ruleList.append(initialRule) + + ######################################## + # for all problem indices in this index + for problemIndex in range(currentIndexRange[currentIndex][0]+1, \ + currentIndexRange[currentIndex][1]): + nextIndexRange[currentIndex][0] = problemIndex + nextIndexRange[currentIndex][1] = problemIndex+1 + priorRule = ruleList[len(ruleList)-1] + priorRuleForSize = deepcopy(priorRule) + priorRuleForSize[0] = problemIndex + + if isLastIndex: + winnerIdx = self.winnerForRange(nextIndexRange) + candidateRule = [ problemIndex, winnerIdx] + else: + candidateRule = [ problemIndex, self.enRule(nextIndexIndex, \ + nextIndexRange) ] + + ######################################## + # candidate same as prior + if candidateRule[1] == priorRule[1]: + print "%sP[%2u]: same" % (tab, problemIndex) + ruleList[len(ruleList)-1][0] = problemIndex + continue + + ######################################## + # compare candidate vs prior + else: + print "%sScoring P:%s for Prior=%s, Cand=%s" \ + % ( tab, nextIndexRange, priorRuleForSize, candidateRule) + priorRuleScore = self.scoreRangeForLogic(nextIndexRange, \ + [priorRuleForSize]) + candidateRuleScore = self.scoreRangeForLogic(nextIndexRange, \ + [candidateRule]) + candidateRuleScore += self.parameters["BranchWeight"] # penalize + candidateFaster = candidateRuleScore < priorRuleScore + print "%sP[%2u]: %s %s~%.0fus < %s~%.0fus" % (tab, problemIndex, \ + "wins" if candidateFaster else "same", \ + candidateRule, candidateRuleScore, priorRuleForSize, \ + priorRuleScore ) + + ######################################## + # candidate wins + if candidateRuleScore < priorRuleScore: + ruleList.append(candidateRule) + + ######################################## + # prior wins + else: + ruleList[len(ruleList)-1][0] = problemIndex + print "%sReturning RuleList: %s" % (tab, ruleList) return ruleList @@ -417,20 +486,9 @@ def enRule(self, currentIndexIndex, currentIndexRange): ############################################################################## - - - ############################################################################## - ############################################################################## - ### - ### Helper / Low-Level Functions - ### - ############################################################################## - ############################################################################## - - - ############################################################################## # Print2D + ############################################################################## def print2D(self, indices ): indicesIdx = 0 problemIndices = [] @@ -538,6 +596,7 @@ def print2D(self, indices ): ############################################################################## # Least Important Solution + ############################################################################## def leastImportantSolution(self): solutionImportance = [] for i in range(0, self.numSolutions): @@ -590,66 +649,70 @@ def leastImportantSolution(self): ############################################################################## - # Get Winner For Problem - def getWinnerForProblem(self, problemIndices): - problemSerial = self.indicesToSerial(0, problemIndices) - winnerIdx = -1 - winnerGFlops = -1 - for solutionIdx in range(0, self.numSolutions): - solutionSerialIdx = problemSerial + solutionIdx - solutionGFlops = self.data[solutionSerialIdx] - if solutionGFlops > winnerGFlops: - #print "%f > %f" % (solutionGFlops, winnerGFlops) - winnerIdx = solutionIdx - winnerGFlops = solutionGFlops - return (winnerIdx, winnerGFlops) - + # Remove Solution ############################################################################## - # Winner For Range - def winnerForRange(self, indexRange): - scores = self.scoreRangeForSolutions(indexRange) - winnerIdx = 0 - for solutionIdx in range(1, self.numSolutions): - if scores[solutionIdx] < scores[winnerIdx]: - winnerIdx = solutionIdx - return winnerIdx + def removeSolution(self, removeSolutionIdx): + + # temporarily move current to old + oldSolutions = self.solutions + oldNumSolutions = self.numSolutions + oldData = self.data + oldTotalSize = self.totalSize + + # update solutions + self.solutions = [] + for i in range(0, oldNumSolutions): + if i != removeSolutionIdx: + self.solutions.append(oldSolutions[i]) + self.solutionMinNaming = Solution.getMinNaming(self.solutions) + self.solutionNames = [] + self.solutionTiles = [] + for solution in self.solutions: + self.solutionNames.append(Solution.getNameMin(solution, \ + self.solutionMinNaming)) + self.solutionTiles.append("%ux%u"%(solution["MacroTile0"], \ + solution["MacroTile1"])) + self.numSolutions = len(self.solutions) + + # update data + self.totalSize = self.totalProblems * self.numSolutions + self.data = array.array('f', [0]*self.totalSize) + for problemIndex in range(0, self.totalProblems): + newSolutionIdx = 0 + for oldSolutionIdx in range(0, oldNumSolutions): + if oldSolutionIdx != removeSolutionIdx: + self.data[problemIndex*self.numSolutions+newSolutionIdx] \ + = oldData[problemIndex*oldNumSolutions+oldSolutionIdx] + newSolutionIdx += 1 - ############################################################################## - # Score (microseconds) Range For Solutions - def scoreRangeForSolutions(self, indexRange): - scores = [0]*self.numSolutions - for problemIndices in self.problemIndicesForRange(indexRange): - problemSerial = self.indicesToSerial(0, problemIndices) - totalFlops = self.totalFlopsForProblemIndices(problemIndices) - for solutionIdx in range(0, self.numSolutions): - gflops = self.data[problemSerial+solutionIdx] - timeUs = totalFlops / gflops / 1000 - scores[solutionIdx] += timeUs - return scores ############################################################################## # Score Range For Logic + ############################################################################## def scoreRangeForLogic(self, indexRange, logic): - print "ScoreRangeForLogic", indexRange, logic - depth = self.getLogicDepth([logic]) + #print "ScoreRangeForLogic", indexRange, logic + depth = self.getLogicDepth(logic) depth = self.numIndices - depth + #print "%sSRFL R=%s L=%s" % (self.tab[depth], indexRange, logic) #obj = logic #while isinstance(obj[0], list): # obj = obj[0][1] # depth -= 1 - print "Depth:", depth + #print "Depth:", depth fullLogic = deepcopy(logic) for i in range(0, depth): #print "Logic:", fullLogic - fullLogic = [-1, [fullLogic]] - fullLogic = [fullLogic] + fullLogic = [[-1, fullLogic]] + fullLogic = fullLogic #print "FullLogic:", fullLogic - return self.scoreRangeForFullLogic(indexRange, fullLogic) + return self.scoreRangeForFullLogic(depth, indexRange, fullLogic) ############################################################################## # Score Range For Full Logic - def scoreRangeForFullLogic(self, indexRange, logic): - print "ScoreRangeForFullLogic", indexRange, logic + ############################################################################## + def scoreRangeForFullLogic(self, depth, indexRange, logic): + #print "ScoreRangeForFullLogic", indexRange, logic + #print "%sSRFFL R=%s L=%s" % (self.tab[depth], indexRange, logic) score = 0 for problemIndices in self.problemIndicesForRange(indexRange): problemSerial = self.indicesToSerial(0, problemIndices) @@ -659,20 +722,25 @@ def scoreRangeForFullLogic(self, indexRange, logic): gflops = self.data[problemSerial + solutionIdx] timeUs = totalFlops / gflops / 1000 score += timeUs + #print "%sSRFFL t+=%.0f" % (self.tab[depth], timeUs) logicComplexity = [0]*self.numIndices self.scoreLogicComplexity(logic, logicComplexity) + #print "%sSRFFL Complexity=%s" % (self.tab[depth], logicComplexity) score += self.parameters["BranchWeight"] * sum(logicComplexity) - print "LogicComplexity:", logicComplexity + #print "LogicComplexity:", logicComplexity return score ############################################################################## # Get Solution For Problem Indices Using Logic + ############################################################################## def getSolutionForProblemIndicesUsingLogic(self, problemIndices, logic): - currentProblemIndices = problemIndices + #print "i:", problemIndices + currentProblemIndices = self.toIndexOrder(problemIndices) + #print "i:", currentProblemIndices currentLogic = logic for i in range(0, self.numIndices): - #print "CurrentLogic[%u]: %s" % (i, currentLogic) currentSizeIndex = currentProblemIndices[0] + #print "CurrentLogic[%u] P[%2u]: %s" % (i, currentSizeIndex, currentLogic) for j in range(0, len(currentLogic)): if currentLogic[j][0] < 0: currentProblemIndices = currentProblemIndices[1:] @@ -683,16 +751,67 @@ def getSolutionForProblemIndicesUsingLogic(self, problemIndices, logic): currentProblemIndices = currentProblemIndices[1:] currentLogic = currentLogic[j][1] break - #print "CurrentLogic[%u]: %s" % (i, currentLogic) + #print "FinalLogic[%u]: %s" % (i, currentLogic) return currentLogic + + ############################################################################## + ############################################################################## + ### + ### Helper / Low-Level Functions + ### + ############################################################################## + ############################################################################## + + + ############################################################################## + # Get Winner For Problem + def getWinnerForProblem(self, problemIndices): + problemSerial = self.indicesToSerial(0, problemIndices) + winnerIdx = -1 + winnerGFlops = -1 + for solutionIdx in range(0, self.numSolutions): + solutionSerialIdx = problemSerial + solutionIdx + solutionGFlops = self.data[solutionSerialIdx] + if solutionGFlops > winnerGFlops: + #print "%f > %f" % (solutionGFlops, winnerGFlops) + winnerIdx = solutionIdx + winnerGFlops = solutionGFlops + return (winnerIdx, winnerGFlops) + + + ############################################################################## + # Winner For Range + def winnerForRange(self, indexRange): + scores = self.scoreRangeForSolutions(indexRange) + winnerIdx = 0 + for solutionIdx in range(1, self.numSolutions): + if scores[solutionIdx] < scores[winnerIdx]: + winnerIdx = solutionIdx + return winnerIdx + + + ############################################################################## + # Score (microseconds) Range For Solutions + def scoreRangeForSolutions(self, indexRange): + scores = [0]*self.numSolutions + for problemIndices in self.problemIndicesForRange(indexRange): + problemSerial = self.indicesToSerial(0, problemIndices) + totalFlops = self.totalFlopsForProblemIndices(problemIndices) + for solutionIdx in range(0, self.numSolutions): + gflops = self.data[problemSerial+solutionIdx] + timeUs = totalFlops / gflops / 1000 + scores[solutionIdx] += timeUs + return scores + + ############################################################################## # Score Logic Complexity def scoreLogicComplexity(self, logic, logicComplexity): - print "ScoreLogicComplexity: %s" % (logic) depth = self.getLogicDepth(logic) - depth = self.numIndices - depth if depth == 0: return + depth = self.numIndices - depth + #print "ScoreLogicComplexity[%u]: %s" % (depth, logic) #print "[%u]ScoreLogicComplexity: %s" % (depth, logic) currentLogic = logic for i in range(0, len(logic)): @@ -710,6 +829,21 @@ def getLogicDepth(self, logic): depth += 1 return depth + ############################################################################## + # To Index Order + def toIndexOrder(self, problemIndices): + ordered = [] + for i in self.indexOrder: + ordered.append(problemIndices[i]) + return ordered +# serial order = 0, 1, 2, 3 +# problem indi = 9, 8, 7, 6 + +# index order = 3, 2, 0, 1 +# ordered = 6, 7, 9, 8 +# +# + ############################################################################## # Total Flops For Problem Indices def totalFlopsForProblemIndices(self, problemIndices): @@ -718,41 +852,6 @@ def totalFlopsForProblemIndices(self, problemIndices): totalFlops *= self.problemIndexToSize[i][problemIndices[i]] return totalFlops - ############################################################################## - # Remove Solution - def removeSolution(self, removeSolutionIdx): - - # temporarily move current to old - oldSolutions = self.solutions - oldNumSolutions = self.numSolutions - oldData = self.data - oldTotalSize = self.totalSize - - # update solutions - self.solutions = [] - for i in range(0, oldNumSolutions): - if i != removeSolutionIdx: - self.solutions.append(oldSolutions[i]) - self.solutionMinNaming = Solution.getMinNaming(self.solutions) - self.solutionNames = [] - self.solutionTiles = [] - for solution in self.solutions: - self.solutionNames.append(Solution.getNameMin(solution, \ - self.solutionMinNaming)) - self.solutionTiles.append("%ux%u"%(solution["MacroTile0"], \ - solution["MacroTile1"])) - self.numSolutions = len(self.solutions) - - # update data - self.totalSize = self.totalProblems * self.numSolutions - self.data = array.array('f', [0]*self.totalSize) - for problemIndex in range(0, self.totalProblems): - newSolutionIdx = 0 - for oldSolutionIdx in range(0, oldNumSolutions): - if oldSolutionIdx != removeSolutionIdx: - self.data[problemIndex*self.numSolutions+newSolutionIdx] \ - = oldData[problemIndex*oldNumSolutions+oldSolutionIdx] - newSolutionIdx += 1 ############################################################################## # Recommended Index Order @@ -767,28 +866,6 @@ def recommendedIndexOrder(self): order.append(self.idx1) return order - ############################################################################## - # Print Data - def printData(self): - print2("serial; idxD0, idxD1, idxDU, idxOthers; sizeD0, sizeD1, sizeDU, sizeOthers; sol0, sol1, sol2, ...") - indices = [0]*self.numIndices - for serial in range(0, self.totalProblems): - s = "[%4u] [%2u" % (serial, indices[0]) - for i in range(1, self.numIndices): - s += ", %2u" % indices[i] - s += "] [%4u" % self.problemIndexToSize[0][indices[0]] - for i in range(1, self.numIndices): - s += ", %4u" % self.problemIndexToSize[i][indices[i]] - s += "]: %9.3f" % self.data[serial*self.numSolutions+0] - for i in range(1, self.numSolutions): - s += ", %9.3f" % self.data[serial*self.numSolutions+i] - print2(s) - indices[0] += 1 - for i in range(1, self.numIndices): - if indices[i-1] >= self.numProblemSizes[i-1]: - indices[i-1] = 0 - indices[i] += 1 - ############################################################################## # Problem Indices For Range def problemIndicesForRange(self, indexRange): @@ -814,7 +891,6 @@ def problemIndicesForRange(self, indexRange): return problemIndexList - ############################################################################## # Get Size Free def getSizeFree(self, problemIndices): @@ -929,3 +1005,7 @@ def main( config ): schedulePrefix, logic) popWorkingPath() + +######################################## +# TODO +# - is scoring working diff --git a/Tensile/SolutionStructs.py b/Tensile/SolutionStructs.py index 795c32022..e3d66be05 100644 --- a/Tensile/SolutionStructs.py +++ b/Tensile/SolutionStructs.py @@ -124,7 +124,7 @@ def numRegisters( self ): def numBytes( self ): return self.numRegisters() * 4 def flopsPerMac(self): - return 1 if self.isReal() else 8 + return 2 if self.isReal() else 8 def __str__(self): return self.toChar() From 6d68b297709b7a414e903646702f9aab9cfd142b Mon Sep 17 00:00:00 2001 From: David Tanner Date: Thu, 2 Mar 2017 11:58:38 -0600 Subject: [PATCH 15/21] increasing branch penalty does decrease number of branches and worsen score --- Tensile/LibraryLogic.py | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/Tensile/LibraryLogic.py b/Tensile/LibraryLogic.py index ae5b810e9..0ab30ab9b 100644 --- a/Tensile/LibraryLogic.py +++ b/Tensile/LibraryLogic.py @@ -451,10 +451,22 @@ def enRule(self, currentIndexIndex, currentIndexRange): else: print "%sScoring P:%s for Prior=%s, Cand=%s" \ % ( tab, nextIndexRange, priorRuleForSize, candidateRule) + # score prior priorRuleScore = self.scoreRangeForLogic(nextIndexRange, \ [priorRuleForSize]) + logicComplexity = [0]*self.numIndices + self.scoreLogicComplexity( \ + [priorRuleForSize], logicComplexity) + priorRuleScore += self.parameters["BranchWeight"] \ + * sum(logicComplexity) + # score candidate candidateRuleScore = self.scoreRangeForLogic(nextIndexRange, \ [candidateRule]) + logicComplexity = [0]*self.numIndices + self.scoreLogicComplexity( \ + [candidateRule], logicComplexity) + candidateRuleScore += self.parameters["BranchWeight"] \ + * sum(logicComplexity) candidateRuleScore += self.parameters["BranchWeight"] # penalize candidateFaster = candidateRuleScore < priorRuleScore print "%sP[%2u]: %s %s~%.0fus < %s~%.0fus" % (tab, problemIndex, \ @@ -723,10 +735,10 @@ def scoreRangeForFullLogic(self, depth, indexRange, logic): timeUs = totalFlops / gflops / 1000 score += timeUs #print "%sSRFFL t+=%.0f" % (self.tab[depth], timeUs) - logicComplexity = [0]*self.numIndices - self.scoreLogicComplexity(logic, logicComplexity) + #logicComplexity = [0]*self.numIndices + #self.scoreLogicComplexity(logic, logicComplexity) #print "%sSRFFL Complexity=%s" % (self.tab[depth], logicComplexity) - score += self.parameters["BranchWeight"] * sum(logicComplexity) + #score += self.parameters["BranchWeight"] * sum(logicComplexity) #print "LogicComplexity:", logicComplexity return score From af066ce9a1bf749edf2d6c1c5e0e63fd47730868 Mon Sep 17 00:00:00 2001 From: David Tanner Date: Thu, 2 Mar 2017 16:21:13 -0600 Subject: [PATCH 16/21] writing new library code for new solution selection logic --- Tensile/ClientWriter.py | 9 ++- Tensile/LibraryLogic.py | 128 ++++++++++++-------------------- Tensile/TensileCreateLibrary.py | 79 ++++++++++++++++---- Tensile/YAMLIO.py | 56 ++++++++++---- 4 files changed, 155 insertions(+), 117 deletions(-) diff --git a/Tensile/ClientWriter.py b/Tensile/ClientWriter.py index f3289dcc8..2ee9ba083 100644 --- a/Tensile/ClientWriter.py +++ b/Tensile/ClientWriter.py @@ -1,4 +1,4 @@ -from Common import globalParameters, HR, pushWorkingPath, popWorkingPath, print1, CHeader, printExit +from Common import globalParameters, HR, pushWorkingPath, popWorkingPath, print1, CHeader, printExit, printWarning from SolutionStructs import Solution from SolutionWriter import SolutionWriter import YAMLIO @@ -54,13 +54,14 @@ def main( config ): ############################################################################## logicFiles = [os.path.join(libraryLogicPath, f) for f \ in os.listdir(libraryLogicPath) \ - if os.path.isfile(os.path.join(libraryLogicPath, f))] + if (os.path.isfile(os.path.join(libraryLogicPath, f)) \ + and os.path.splitext(f)[1]==".yaml")] print1("LogicFiles: %s" % logicFiles) functions = [] functionNames = [] for logicFileName in logicFiles: - (scheduleName, problemType, solutionsForType, skinnyLogic0, skinnyLogic1, \ - diagonalLogic) = YAMLIO.readLibraryLogicForProblemType(logicFileName) + (scheduleName, problemType, solutionsForType, indexOrder, logic) \ + = YAMLIO.readLibraryLogicForProblemType(logicFileName) functions.append((scheduleName, problemType)) functionNames.append("tensile_%s_%s" % (scheduleName, problemType)) diff --git a/Tensile/LibraryLogic.py b/Tensile/LibraryLogic.py index 0ab30ab9b..3016f1c79 100644 --- a/Tensile/LibraryLogic.py +++ b/Tensile/LibraryLogic.py @@ -2,6 +2,7 @@ import os.path import array import csv +from sys import stdout from copy import deepcopy @@ -20,9 +21,6 @@ def analyzeProblemType( problemTypeTuple, inputParameters ): print2(HR) print1("# %s" % problemType) - #print "# %s" % dataFileName - #print "# %s" % solutionsFileName - ###################################### # Read Solutions (problemSizes, solutions) = YAMLIO.readSolutions(solutionsFileName) @@ -58,20 +56,18 @@ def analyzeProblemType( problemTypeTuple, inputParameters ): ###################################### # Create Rules logic = logicAnalyzer.enRule(0, logicAnalyzer.globalIndexRange) - print "Final Logic:" - print logic + print2("# Final Logic:") + print2(logic) logicComplexity = [0]*logicAnalyzer.numIndices logicAnalyzer.scoreLogicComplexity(logic, logicComplexity) - print "Logic Complexity:", logicComplexity + print2("Logic Complexity: %s" % logicComplexity) score = logicAnalyzer.scoreRangeForLogic( \ logicAnalyzer.globalIndexRange, logic) - print "Global Score:", score - - + print1("\nScore: %.0f ms" % (score/1000)) #return (skinnyRules01, skinnyRules10, diagonalRules) #return (problemType, logicAnalyzer.solutionsUsed, [], [], logicAnalyzer.diagonalRules ) - return (problemType, [], [], [], [] ) + return (problemType, logicAnalyzer.solutions, logicAnalyzer.indexOrder, logic) @@ -80,20 +76,6 @@ def analyzeProblemType( problemTypeTuple, inputParameters ): ################################################################################ class LogicAnalyzer: - ######################################## - # diagonal rule looks like - # 0: solutionIdx - # 1: problemIndices for minThreshold problem - # 2: gflops at above minSize - # 3: maxGFlops for this solution along diagonal in interval it won - # 4: gflops of prior winner at minSize, i.e., what performance did it beat - - ######################################## - # skinny rule looks like - # 0: solutionIdx - # 1: problemIndices for minThreshold problem - # 2: gflops at above minSize - ############################################################################## ############################################################################## ### @@ -173,8 +155,6 @@ def __init__(self, problemType, problemSizes, solutions, inputParameters): self.w0 = self.parameters["Weight0"] self.w1 = self.parameters["Weight1"] self.w2 = self.parameters["Weight2"] - #print "S->I %s" % self.problemSizeToIndex - #print "I->S %s" % self.problemIndexToSize self.indexOrder = self.recommendedIndexOrder() print2("IndexOrder: %s" % self.indexOrder) self.globalIndexRange = [] @@ -224,10 +204,6 @@ def populateFromCSV(self, dataFileName): for i in range(0, self.numIndices): problemIndices.append(self.problemSizeToIndex[i][problemSize[i]]) serialIdx = self.indicesToSerial(0, problemIndices) - #print "%s -> %s -> %u" % (problemSize, problemIndices, serialIdx) - - # total size - #totalFlops = float(row[totalSizeIdx]) # data solutionIdx = 0 @@ -316,7 +292,6 @@ def smooth(self): new = sum(neighborGFlops)/len(neighborGFlops) old = self.data[problemSerial+solutionIdx] s += "%f -> %f" % (old, new) - print s self.data[problemSerial+solutionIdx] \ = sum(neighborGFlops)/len(neighborGFlops) @@ -346,28 +321,30 @@ def smooth(self): def enRule(self, currentIndexIndex, currentIndexRange): cii = currentIndexIndex if currentIndexIndex == 0: - self.tab[cii] = "| " + self.tab[cii] = "[] " elif currentIndexIndex == 1: - self.tab[cii] = "[%2u]-| " % ( \ + self.tab[cii] = "[%2u] " % ( \ currentIndexRange[self.indexOrder[0]][0]) elif currentIndexIndex == 2: - self.tab[cii] = "[%2u,%2u]--| " % ( \ + self.tab[cii] = "[%2u,%2u] " % ( \ currentIndexRange[self.indexOrder[0]][0], \ currentIndexRange[self.indexOrder[1]][0]) elif currentIndexIndex == 3: - self.tab[cii] = "[%2u,%2u,%2u]---| " % ( \ + self.tab[cii] = "[%2u,%2u,%2u] " % ( \ currentIndexRange[self.indexOrder[0]][0], \ currentIndexRange[self.indexOrder[1]][0], \ currentIndexRange[self.indexOrder[2]][0]) elif currentIndexIndex == 4: - self.tab[cii] = "[%2u,%2u,%2u,%2u]---| " % ( \ + self.tab[cii] = "[%2u,%2u,%2u,%2u] " % ( \ currentIndexRange[self.indexOrder[0]][0], \ currentIndexRange[self.indexOrder[1]][0], \ currentIndexRange[self.indexOrder[2]][0], \ currentIndexRange[self.indexOrder[3]][0]) tab = self.tab[cii] + if globalParameters["PrintLevel"] == 1: + stdout.write("\n%s"%tab) currentIndex = self.indexOrder[currentIndexIndex] - print "%senRule(%s)" % (tab, currentIndexRange) + print2("%senRule(%s)" % (tab, currentIndexRange)) nextIndexIndex = currentIndexIndex+1 nextIndexRange = deepcopy(currentIndexRange) isLastIndex = currentIndexIndex == self.numIndices-1 @@ -389,17 +366,21 @@ def enRule(self, currentIndexIndex, currentIndexRange): # if scores[solutionIdx] < scores[winnerIdx]: # winnerIdx = solutionIdx winnerIdx = self.winnerForRange(currentIndexRange) - print "%sreturning early winner=%u" % (tab, winnerIdx) + #print2("%sreturning early winner=%u" % (tab, winnerIdx)) ruleList.append(-1) ruleList.append(winnerIdx) + if globalParameters["PrintLevel"] == 1: + stdout.write("#") ######################################## # this isn't last index, so just recursively return next index else: - print "%sreturning early enRule(%s)" \ - % (tab, nextIndexRange) + #print2("%sreturning early enRule(%s)" \ + # % (tab, nextIndexRange) ) rule = [ -1, self.enRule(nextIndexIndex, nextIndexRange) ] ruleList.append(rule) + if globalParameters["PrintLevel"] == 1: + stdout.write("#") ######################################## # full iterative rule list @@ -416,11 +397,13 @@ def enRule(self, currentIndexIndex, currentIndexRange): winnerIdx = self.winnerForRange(nextIndexRange) initialRule = [ currentIndexRange[currentIndex][0], winnerIdx] else: - print "%sinitialRule(%s)" % (tab, nextIndexRange) + #print2("%sinitialRule(%s)" % (tab, nextIndexRange)) initialRule = [ currentIndexRange[currentIndex][0], \ self.enRule(nextIndexIndex, nextIndexRange) ] - print "%sinitialRule(%s) DONE" % (tab, nextIndexRange) + #print2("%sinitialRule(%s) DONE" % (tab, nextIndexRange)) ruleList.append(initialRule) + if globalParameters["PrintLevel"] == 1: + stdout.write("#") ######################################## # for all problem indices in this index @@ -442,15 +425,17 @@ def enRule(self, currentIndexIndex, currentIndexRange): ######################################## # candidate same as prior if candidateRule[1] == priorRule[1]: - print "%sP[%2u]: same" % (tab, problemIndex) + #print2("%sP[%2u]: same" % (tab, problemIndex)) ruleList[len(ruleList)-1][0] = problemIndex + if globalParameters["PrintLevel"] == 1: + stdout.write(" ") continue ######################################## # compare candidate vs prior else: - print "%sScoring P:%s for Prior=%s, Cand=%s" \ - % ( tab, nextIndexRange, priorRuleForSize, candidateRule) + #print2("%sScoring P:%s for Prior=%s, Cand=%s" \ + # % ( tab, nextIndexRange, priorRuleForSize, candidateRule)) # score prior priorRuleScore = self.scoreRangeForLogic(nextIndexRange, \ [priorRuleForSize]) @@ -469,22 +454,26 @@ def enRule(self, currentIndexIndex, currentIndexRange): * sum(logicComplexity) candidateRuleScore += self.parameters["BranchWeight"] # penalize candidateFaster = candidateRuleScore < priorRuleScore - print "%sP[%2u]: %s %s~%.0fus < %s~%.0fus" % (tab, problemIndex, \ + print2("%sP[%2u]: %s %s~%.0fus < %s~%.0fus" % (tab, problemIndex, \ "wins" if candidateFaster else "same", \ candidateRule, candidateRuleScore, priorRuleForSize, \ - priorRuleScore ) + priorRuleScore )) ######################################## # candidate wins if candidateRuleScore < priorRuleScore: ruleList.append(candidateRule) + if globalParameters["PrintLevel"] == 1: + stdout.write("#") ######################################## # prior wins else: + if globalParameters["PrintLevel"] == 1: + stdout.write(".") ruleList[len(ruleList)-1][0] = problemIndex - print "%sReturning RuleList: %s" % (tab, ruleList) + #print2("%sReturning RuleList: %s" % (tab, ruleList)) return ruleList @@ -562,7 +551,6 @@ def print2D(self, indices ): solutionSerialIdx = problemSerial + solutionIdx solutionGFlops = self.data[solutionSerialIdx] if solutionGFlops > winnerGFlops: - #print "%f > %f" % (solutionGFlops, winnerGFlops) secondIdx = winnerIdx secondGFlops = winnerGFlops winnerIdx = solutionIdx @@ -702,29 +690,18 @@ def removeSolution(self, removeSolutionIdx): # Score Range For Logic ############################################################################## def scoreRangeForLogic(self, indexRange, logic): - #print "ScoreRangeForLogic", indexRange, logic depth = self.getLogicDepth(logic) depth = self.numIndices - depth - #print "%sSRFL R=%s L=%s" % (self.tab[depth], indexRange, logic) - #obj = logic - #while isinstance(obj[0], list): - # obj = obj[0][1] - # depth -= 1 - #print "Depth:", depth fullLogic = deepcopy(logic) for i in range(0, depth): - #print "Logic:", fullLogic fullLogic = [[-1, fullLogic]] fullLogic = fullLogic - #print "FullLogic:", fullLogic return self.scoreRangeForFullLogic(depth, indexRange, fullLogic) ############################################################################## # Score Range For Full Logic ############################################################################## def scoreRangeForFullLogic(self, depth, indexRange, logic): - #print "ScoreRangeForFullLogic", indexRange, logic - #print "%sSRFFL R=%s L=%s" % (self.tab[depth], indexRange, logic) score = 0 for problemIndices in self.problemIndicesForRange(indexRange): problemSerial = self.indicesToSerial(0, problemIndices) @@ -734,25 +711,16 @@ def scoreRangeForFullLogic(self, depth, indexRange, logic): gflops = self.data[problemSerial + solutionIdx] timeUs = totalFlops / gflops / 1000 score += timeUs - #print "%sSRFFL t+=%.0f" % (self.tab[depth], timeUs) - #logicComplexity = [0]*self.numIndices - #self.scoreLogicComplexity(logic, logicComplexity) - #print "%sSRFFL Complexity=%s" % (self.tab[depth], logicComplexity) - #score += self.parameters["BranchWeight"] * sum(logicComplexity) - #print "LogicComplexity:", logicComplexity return score ############################################################################## # Get Solution For Problem Indices Using Logic ############################################################################## def getSolutionForProblemIndicesUsingLogic(self, problemIndices, logic): - #print "i:", problemIndices currentProblemIndices = self.toIndexOrder(problemIndices) - #print "i:", currentProblemIndices currentLogic = logic for i in range(0, self.numIndices): currentSizeIndex = currentProblemIndices[0] - #print "CurrentLogic[%u] P[%2u]: %s" % (i, currentSizeIndex, currentLogic) for j in range(0, len(currentLogic)): if currentLogic[j][0] < 0: currentProblemIndices = currentProblemIndices[1:] @@ -763,7 +731,6 @@ def getSolutionForProblemIndicesUsingLogic(self, problemIndices, logic): currentProblemIndices = currentProblemIndices[1:] currentLogic = currentLogic[j][1] break - #print "FinalLogic[%u]: %s" % (i, currentLogic) return currentLogic @@ -786,7 +753,6 @@ def getWinnerForProblem(self, problemIndices): solutionSerialIdx = problemSerial + solutionIdx solutionGFlops = self.data[solutionSerialIdx] if solutionGFlops > winnerGFlops: - #print "%f > %f" % (solutionGFlops, winnerGFlops) winnerIdx = solutionIdx winnerGFlops = solutionGFlops return (winnerIdx, winnerGFlops) @@ -823,8 +789,6 @@ def scoreLogicComplexity(self, logic, logicComplexity): depth = self.getLogicDepth(logic) if depth == 0: return depth = self.numIndices - depth - #print "ScoreLogicComplexity[%u]: %s" % (depth, logic) - #print "[%u]ScoreLogicComplexity: %s" % (depth, logic) currentLogic = logic for i in range(0, len(logic)): logicComplexity[depth] += 1 @@ -841,6 +805,7 @@ def getLogicDepth(self, logic): depth += 1 return depth + ############################################################################## # To Index Order def toIndexOrder(self, problemIndices): @@ -848,13 +813,7 @@ def toIndexOrder(self, problemIndices): for i in self.indexOrder: ordered.append(problemIndices[i]) return ordered -# serial order = 0, 1, 2, 3 -# problem indi = 9, 8, 7, 6 -# index order = 3, 2, 0, 1 -# ordered = 6, 7, 9, 8 -# -# ############################################################################## # Total Flops For Problem Indices @@ -1012,12 +971,17 @@ def main( config ): # Run Analysis schedulePrefix = globalParameters["Name"] for problemTypeTuple in problemTypeTuples: - logic = analyzeProblemType( problemTypeTuple, analysisParameters ) + logicTuple = analyzeProblemType( problemTypeTuple, analysisParameters ) YAMLIO.writeLibraryLogicForProblemType(globalParameters["WorkingPath"], \ - schedulePrefix, logic) + schedulePrefix, logicTuple) popWorkingPath() ######################################## # TODO -# - is scoring working +# - different weights for different levels? +# are there pairs of weights that would result in same logic complexity but better score? + +######################################## +# TODO problems which this algorithm +# - barrier to switching may not always be amortised on next step, need to calculate several steps into future to see if net win; process needs to be a search tree. 32x32 search only takes 1 second diff --git a/Tensile/TensileCreateLibrary.py b/Tensile/TensileCreateLibrary.py index ef7c13392..a17366852 100644 --- a/Tensile/TensileCreateLibrary.py +++ b/Tensile/TensileCreateLibrary.py @@ -171,9 +171,8 @@ def writeLogic(outputPath, logicList, solutionWriter ): scheduleName = logicProblemType[0] problemType = logicProblemType[1] solutions = logicProblemType[2] - skinnyLogic0 = logicProblemType[3] - skinnyLogic1 = logicProblemType[4] - diagonalLogic = logicProblemType[5] + indexOrder = logicProblemType[3] + logic = logicProblemType[4] # solution names solutionNames = [] @@ -218,14 +217,22 @@ def writeLogic(outputPath, logicList, solutionWriter ): for i in range(1, len(problemType["IndicesSummation"])): s += "*size%s" % indexChars[problemType["IndicesSummation"][i]] s += ";\n\n" - for rule in skinnyLogic0: - print2(rule) - for rule in skinnyLogic1: - print2(rule) + #for rule in skinnyLogic0: + # print2(rule) + #for rule in skinnyLogic1: + # print2(rule) print2(solutionNames) - for ruleIdx in range(0, len(diagonalLogic)): - rule = diagonalLogic[ruleIdx] + logicStr = writeLogicRec(0, indexOrder, logic, solutionNames) + print logicStr + printExit("TODO") + #for indexIndex in range(0, problemType["TotalIndices"]): + # index = indexOrder[indexIndex] + # for ruleIdx in range(0, len(logic)): + # rule = logic[ruleIdx] + + + """ print2(rule) winnerIdx = rule[0] problemSize = rule[1] @@ -267,7 +274,7 @@ def writeLogic(outputPath, logicList, solutionWriter ): s += ", size%s" % indexChars[i] s += ", stream, numInputEvents, inputEvents, outputEvent ); /* [%f,%f] GFlops*/\n" % (minGFlops,maxGFlops) - + """ s += "\n}\n" # open and close individual files @@ -289,6 +296,45 @@ def writeLogic(outputPath, logicList, solutionWriter ): logicHeaderFile.write(h) logicHeaderFile.close() +################################################################################ +# Write Logic Recursive +################################################################################ +def writeLogicRec(depth, indexOrder, logic, solutionNames): + indexChars = globalParameters["IndexChars"] + indent = " " + indent += " "*depth + s = "" + lowestLevel = depth == len(indexOrder)-1 + numRules = len(logic) + if numRules > 1: + # multiple rules, need if/else + for ruleIdx in range(0, numRules): + rule = logic[ruleIdx] + threshold = rule[0] + if lowestLevel: + solutionIdx = rule[1] + s += "%sif (size%s < %u) return solution[%u];\n" \ + % (indent, indexChars[indexOrder[depth]], threshold, solutionIdx) + else: + s += "%sif (size%s < %u) {\n" \ + % (indent, indexChars[indexOrder[depth]], threshold) + s += writeLogicRec(depth+1, indexOrder, rule[1], solutionNames) + s += "%s}\n" % (indent) + else: + ruleIdx = 0 + rule = logic[ruleIdx] + threshold = rule[0] + if lowestLevel: + solutionIdx = rule[1] + s += "%sreturn solution[%u];\n" \ + % (indent, solutionIdx) + else: + s += "%s{\n" \ + % (indent) + s += writeLogicRec(depth+1, indexOrder, rule[1], solutionNames) + s += "%s}\n" % (indent) + return s + ################################################################################ # Write CMake @@ -412,11 +458,12 @@ def TensileCreateLibrary(): printExit("LogicPath %s doesn't exist" % logicPath) logicFiles = [os.path.join(logicPath, f) for f in os.listdir(logicPath) \ - if os.path.isfile(os.path.join(logicPath, f))] + if (os.path.isfile(os.path.join(logicPath, f)) \ + and os.path.splitext(f)[1]==".yaml")] - print2("# LibraryLogicFiles:" % logicFiles) + print1("# LibraryLogicFiles:" % logicFiles) for logicFile in logicFiles: - print2("# %s" % logicFile) + print1("# %s" % logicFile) ############################################################################## # Parse config files @@ -424,10 +471,10 @@ def TensileCreateLibrary(): solutions = [] logicList = [] for logicFileName in logicFiles: - (scheduleName, problemType, solutionsForType, skinnyLogic0, skinnyLogic1, \ - diagonalLogic) = YAMLIO.readLibraryLogicForProblemType(logicFileName) + (scheduleName, problemType, solutionsForType, indexOrder, logic) \ + = YAMLIO.readLibraryLogicForProblemType(logicFileName) logicList.append((scheduleName, problemType, solutionsForType, \ - skinnyLogic0, skinnyLogic1, diagonalLogic)) + indexOrder, logic )) for solution in solutionsForType: if solution not in solutions: solutions.append(solution) diff --git a/Tensile/YAMLIO.py b/Tensile/YAMLIO.py index c0f7393b3..a806bd54e 100644 --- a/Tensile/YAMLIO.py +++ b/Tensile/YAMLIO.py @@ -85,27 +85,52 @@ def readSolutions( filename ): # 1 yaml per problem type # problemType, skinny0, skinny1, diagonal ################################################################################ -def writeLibraryLogicForProblemType( filePath, schedulePrefix, logic): - problemType = logic[0] - solutions = logic[1] - skinnyLogic0 = logic[2] - skinnyLogic1 = logic[3] - diagonalLogic = logic[4] +def writeLibraryLogicForProblemType( filePath, schedulePrefix, logicTuple): + problemType = logicTuple[0] + solutions = logicTuple[1] + indexOrder = logicTuple[2] + logic = logicTuple[3] filename = os.path.join(filePath, "%s_%s.yaml" \ % (schedulePrefix, str(problemType))) print2("# writeLogic( %s )" % ( filename )) - # open file + data = [] + # logic name + data.append(globalParameters["Name"]) + # problem type + problemTypeState = problemType.state + problemTypeState["DataType"] = \ + problemTypeState["DataType"].value + data.append(problemTypeState) + # solutions + solutionList = [] + for solution in solutions: + solutionState = solution.state + solutionState["ProblemType"] = solutionState["ProblemType"].state + solutionState["ProblemType"]["DataType"] = \ + solutionState["ProblemType"]["DataType"].value + solutionList.append(solutionState) + data.append(solutionList) + # index order + data.append(indexOrder) + # logic + data.append(logic) + + # open & write file try: stream = open(filename, "w") + #yaml.dump(data, stream, default_flow_style=False) + yaml.dump(data, stream) + stream.close() except IOError: printExit("Cannot open file: %s" % filename) + """ + #data = [ globalParameters["Name"], problemTypeState, [], [], [] ] # write problem type problemTypeState = problemType.state problemTypeState["DataType"] = \ problemTypeState["DataType"].value - data = [ globalParameters["Name"], problemTypeState, [], [], [], [] ] for solution in solutions: solutionState = solution.state solutionState["ProblemType"] = solutionState["ProblemType"].state @@ -122,6 +147,7 @@ def writeLibraryLogicForProblemType( filePath, schedulePrefix, logic): #stream.write(data) yaml.dump(data, stream, default_flow_style=False) stream.close() + """ def readLibraryLogicForProblemType( filename ): @@ -134,19 +160,20 @@ def readLibraryLogicForProblemType( filename ): stream.close() # verify - if len(data) < 6: + if len(data) < 5: printExit("len(%s) %u < 6" % (filename, len(data))) # parse out objects scheduleName = data[0] problemTypeState = data[1] solutionStates = data[2] - skinnyLogic0 = data[3] - skinnyLogic1 = data[4] - diagonalLogic = data[5] + indexOrder = data[3] + logic = data[4] - solutions = [] + # unpack problemType problemType = ProblemType(problemTypeState) + # unpack solutions + solutions = [] for i in range(0, len(solutionStates)): solutionState = solutionStates[i] solutionObject = Solution(solutionState) @@ -155,5 +182,4 @@ def readLibraryLogicForProblemType( filename ): % (problemType, solutionObject["ProblemType"])) solutions.append(solutionObject) - return (scheduleName, problemType, solutions, skinnyLogic0, skinnyLogic1, \ - diagonalLogic) + return (scheduleName, problemType, solutions, indexOrder, logic ) From c73e3ac537099f4c58d5e2e90e36bf1bf0adbe1c Mon Sep 17 00:00:00 2001 From: David Tanner Date: Fri, 3 Mar 2017 08:46:07 -0600 Subject: [PATCH 17/21] recursive solution selection logic working --- Tensile/Common.py | 14 +--- Tensile/LibraryLogic.py | 46 +++++++---- Tensile/TensileCreateLibrary.py | 131 +++++++++++++------------------- Tensile/__init__.py | 2 +- 4 files changed, 90 insertions(+), 103 deletions(-) diff --git a/Tensile/Common.py b/Tensile/Common.py index 9b04526c0..c2b8f54b3 100644 --- a/Tensile/Common.py +++ b/Tensile/Common.py @@ -152,16 +152,10 @@ # Default Analysis Parameters ################################################################################ defaultAnalysisParameters = { - "InitialSolutionWindow": 4, - "BranchWeight": 100, # microseconds / kernel - - "Dilation": 3, - "Threshold": 0.1, - "OutlierThreshold": 0, - "FractionTimeSavedMin": 0.01, # = 1% - "Weight0": 100, - "Weight1": 100, - "Weight2": 100, + "InitialSolutionWindow": 4, + "BranchPenalty": 10000, # microseconds / kernel + "SmoothOutliers": False, # enforce monotonic data + "SolutionImportanceMin": 0.01, # = 1% } diff --git a/Tensile/LibraryLogic.py b/Tensile/LibraryLogic.py index 3016f1c79..823d48452 100644 --- a/Tensile/LibraryLogic.py +++ b/Tensile/LibraryLogic.py @@ -50,8 +50,9 @@ def analyzeProblemType( problemTypeTuple, inputParameters ): ###################################### # Correct outliers - # logicAnalyzer.smooth() - logicAnalyzer.print2D([0, 0]) + if inputParameters["SmoothOutliers"]: + logicAnalyzer.smoothOutliers() + #logicAnalyzer.print2D([0, 0]) ###################################### # Create Rules @@ -65,6 +66,8 @@ def analyzeProblemType( problemTypeTuple, inputParameters ): logicAnalyzer.globalIndexRange, logic) print1("\nScore: %.0f ms" % (score/1000)) + logicAnalyzer.prepareLogic(logic) + #return (skinnyRules01, skinnyRules10, diagonalRules) #return (problemType, logicAnalyzer.solutionsUsed, [], [], logicAnalyzer.diagonalRules ) return (problemType, logicAnalyzer.solutions, logicAnalyzer.indexOrder, logic) @@ -152,9 +155,6 @@ def __init__(self, problemType, problemSizes, solutions, inputParameters): self.rangeIndicesFree = range(0, self.problemType["NumIndicesC"]) self.rangeIndicesSummation = range(self.problemType["NumIndicesC"], \ self.problemType["TotalIndices"]) - self.w0 = self.parameters["Weight0"] - self.w1 = self.parameters["Weight1"] - self.w2 = self.parameters["Weight2"] self.indexOrder = self.recommendedIndexOrder() print2("IndexOrder: %s" % self.indexOrder) self.globalIndexRange = [] @@ -248,7 +248,7 @@ def removeLeastImportantSolutions(self): while True: (lisIdx, lisPercSaved, lisPercWins, lisPercExec) \ = self.leastImportantSolution() - if lisPercSaved < self.parameters["FractionTimeSavedMin"]: + if lisPercSaved < self.parameters["SolutionImportanceMin"]: self.removeSolution(lisIdx) continue else: @@ -256,10 +256,9 @@ def removeLeastImportantSolutions(self): ############################################################################## - # ENTRY: Smooth - correct outliers + # ENTRY: Smooth Outliers ############################################################################## - def smooth(self): - outlierThreshold = self.parameters["OutlierThreshold"] + def smoothOutliers(self): problemSizes = [0]*self.numIndices for problemIndices in self.problemIndicesForGlobalRange: problemSerial = self.indicesToSerial(0, problemIndices) @@ -281,8 +280,8 @@ def smooth(self): neighborAfterGFlops = self.data[neighborAfterIdx+solutionIdx] neighborGFlops.append(neighborBeforeGFlops) neighborGFlops.append(neighborAfterGFlops) - if neighborBeforeGFlops > gflops * (1+outlierThreshold) \ - and neighborAfterGFlops * (1+outlierThreshold) < gflops : + if neighborBeforeGFlops > gflops \ + and neighborAfterGFlops < gflops : smoothProblem = True if smoothProblem: s = "" @@ -442,7 +441,7 @@ def enRule(self, currentIndexIndex, currentIndexRange): logicComplexity = [0]*self.numIndices self.scoreLogicComplexity( \ [priorRuleForSize], logicComplexity) - priorRuleScore += self.parameters["BranchWeight"] \ + priorRuleScore += self.parameters["BranchPenalty"] \ * sum(logicComplexity) # score candidate candidateRuleScore = self.scoreRangeForLogic(nextIndexRange, \ @@ -450,9 +449,9 @@ def enRule(self, currentIndexIndex, currentIndexRange): logicComplexity = [0]*self.numIndices self.scoreLogicComplexity( \ [candidateRule], logicComplexity) - candidateRuleScore += self.parameters["BranchWeight"] \ + candidateRuleScore += self.parameters["BranchPenalty"] \ * sum(logicComplexity) - candidateRuleScore += self.parameters["BranchWeight"] # penalize + candidateRuleScore += self.parameters["BranchPenalty"] # penalize candidateFaster = candidateRuleScore < priorRuleScore print2("%sP[%2u]: %s %s~%.0fus < %s~%.0fus" % (tab, problemIndex, \ "wins" if candidateFaster else "same", \ @@ -487,6 +486,25 @@ def enRule(self, currentIndexIndex, currentIndexRange): ############################################################################## + + ############################################################################## + # Prepare Logic + # convert threshold indices to sizes + # last threshold = -1 + ############################################################################## + def prepareLogic(self, logic): + depth = self.getLogicDepth(logic) + if depth == 0: return + indexIndex = self.numIndices - depth + index = self.indexOrder[indexIndex] + for i in range(0, len(logic)): + if i == len(logic)-1: + logic[i][0] = -1 + else: + logic[i][0] = self.problemIndexToSize[index][logic[i][0]] + self.prepareLogic(logic[i][1]) + + ############################################################################## # Print2D ############################################################################## diff --git a/Tensile/TensileCreateLibrary.py b/Tensile/TensileCreateLibrary.py index a17366852..222a71d55 100644 --- a/Tensile/TensileCreateLibrary.py +++ b/Tensile/TensileCreateLibrary.py @@ -207,6 +207,7 @@ def writeLogic(outputPath, logicList, solutionWriter ): for i in range(0, len(argList)): s += " %s%s" % (argList[i], ",\n" if i < len(argList)-1 else ") {\n\n") + """ indent = " " s += "%ssize_t sizeC = size%s" % ( indent, indexChars[0]) for i in range(1, problemType["NumIndicesC"]): @@ -217,64 +218,11 @@ def writeLogic(outputPath, logicList, solutionWriter ): for i in range(1, len(problemType["IndicesSummation"])): s += "*size%s" % indexChars[problemType["IndicesSummation"][i]] s += ";\n\n" - #for rule in skinnyLogic0: - # print2(rule) - #for rule in skinnyLogic1: - # print2(rule) - print2(solutionNames) - - logicStr = writeLogicRec(0, indexOrder, logic, solutionNames) - print logicStr - printExit("TODO") - #for indexIndex in range(0, problemType["TotalIndices"]): - # index = indexOrder[indexIndex] - # for ruleIdx in range(0, len(logic)): - # rule = logic[ruleIdx] - - """ - print2(rule) - winnerIdx = rule[0] - problemSize = rule[1] - minGFlops = rule[2] - maxGFlops = rule[3] - # rule logic - if ruleIdx == len(diagonalLogic)-1: - if len(diagonalLogic) > 1: - s += "%selse" % indent - else: - s += "%s" % indent - else: - s += "%s%s(sizeC >= static_cast(%u" % (indent, ("if" if ruleIdx == 0 else "else if"), problemSize[0]) - for i in range(1, problemType["NumIndicesC"]): - s += "*%u" % problemSize[i] - s += "))" - s += " return %s(" % solutionNames[winnerIdx] - # solution parameters - s += " dataC, dataA, dataB, alpha" - if problemType["UseBeta"]: - s += ", beta" - s += ", offsetC, offsetA, offsetB" - firstStride = 1 - if problemType["UseInitialStrides"]: - firstStride = 0 - lastStrideC = problemType["NumIndicesC"] - lastStrideA = len(problemType["IndexAssignmentsA"]) - lastStrideB = len(problemType["IndexAssignmentsB"]) - - for i in range(firstStride,lastStrideC): - s += ", strideC%u%s" % (i, indexChars[i]) - for i in range(firstStride,lastStrideA): - s += ", strideA%u%s" % (i, \ - indexChars[problemType["IndexAssignmentsA"][i]]) - for i in range(firstStride,lastStrideB): - s += ", strideB%u%s" % (i, \ - indexChars[problemType["IndexAssignmentsB"][i]]) - for i in range(0, problemType["TotalIndices"]): - s += ", size%s" % indexChars[i] - s += ", stream, numInputEvents, inputEvents, outputEvent ); /* [%f,%f] GFlops*/\n" % (minGFlops,maxGFlops) + print2(solutionNames) - """ + logicStr = writeLogicRec(0, indexOrder, logic, solutionNames, problemType) + s += logicStr s += "\n}\n" # open and close individual files @@ -299,43 +247,70 @@ def writeLogic(outputPath, logicList, solutionWriter ): ################################################################################ # Write Logic Recursive ################################################################################ -def writeLogicRec(depth, indexOrder, logic, solutionNames): +def writeLogicRec(depth, indexOrder, logic, solutionNames, problemType): indexChars = globalParameters["IndexChars"] indent = " " indent += " "*depth s = "" lowestLevel = depth == len(indexOrder)-1 numRules = len(logic) - if numRules > 1: - # multiple rules, need if/else - for ruleIdx in range(0, numRules): - rule = logic[ruleIdx] - threshold = rule[0] - if lowestLevel: - solutionIdx = rule[1] - s += "%sif (size%s < %u) return solution[%u];\n" \ - % (indent, indexChars[indexOrder[depth]], threshold, solutionIdx) - else: - s += "%sif (size%s < %u) {\n" \ - % (indent, indexChars[indexOrder[depth]], threshold) - s += writeLogicRec(depth+1, indexOrder, rule[1], solutionNames) - s += "%s}\n" % (indent) - else: - ruleIdx = 0 + for ruleIdx in range(0, numRules): rule = logic[ruleIdx] threshold = rule[0] if lowestLevel: solutionIdx = rule[1] - s += "%sreturn solution[%u];\n" \ - % (indent, solutionIdx) + solutionCall = writeSolutionCall(solutionNames[solutionIdx],problemType) + if threshold > 0: + s += "%sif (size%s < %u) return %s;\n" \ + % (indent, indexChars[indexOrder[depth]], threshold, solutionCall) + else: + s += "%sreturn %s;\n" % (indent, solutionCall) else: - s += "%s{\n" \ - % (indent) - s += writeLogicRec(depth+1, indexOrder, rule[1], solutionNames) + if threshold > 0: + s += "%sif (size%s < %u) {\n" \ + % (indent, indexChars[indexOrder[depth]], threshold) + else: + s += "%s{\n" % (indent) + s += writeLogicRec(depth+1, indexOrder, rule[1], solutionNames, \ + problemType) s += "%s}\n" % (indent) return s +################################################################################ +# Write Solution Call +################################################################################ +def writeSolutionCall(solutionName, problemType): + indexChars = globalParameters["IndexChars"] + s = "" + s += "%s(" % solutionName + # solution parameters + s += " dataC, dataA, dataB, alpha" + if problemType["UseBeta"]: + s += ", beta" + s += ", offsetC, offsetA, offsetB" + firstStride = 1 + if problemType["UseInitialStrides"]: + firstStride = 0 + lastStrideC = problemType["NumIndicesC"] + lastStrideA = len(problemType["IndexAssignmentsA"]) + lastStrideB = len(problemType["IndexAssignmentsB"]) + for i in range(firstStride,lastStrideC): + s += ", strideC%u%s" % (i, indexChars[i]) + for i in range(firstStride,lastStrideA): + s += ", strideA%u%s" % (i, \ + indexChars[problemType["IndexAssignmentsA"][i]]) + for i in range(firstStride,lastStrideB): + s += ", strideB%u%s" % (i, \ + indexChars[problemType["IndexAssignmentsB"][i]]) + for i in range(0, problemType["TotalIndices"]): + s += ", size%s" % indexChars[i] + s += ", stream, numInputEvents, inputEvents, outputEvent )" + return s + + + + ################################################################################ # Write CMake ################################################################################ diff --git a/Tensile/__init__.py b/Tensile/__init__.py index edc60b350..8a124bf64 100644 --- a/Tensile/__init__.py +++ b/Tensile/__init__.py @@ -1 +1 @@ -__version__ = "2.1.6" +__version__ = "2.2.0" From 88636743bd2c6687ef0a3c070dc8b86097acb8b9 Mon Sep 17 00:00:00 2001 From: David Tanner Date: Fri, 3 Mar 2017 09:13:26 -0600 Subject: [PATCH 18/21] added api timer to library client, runscript prints path to library client --- Tensile/ClientWriter.py | 13 ++++++++----- Tensile/Source/Client.h | 43 ++++++++++++++++++++++++++--------------- 2 files changed, 35 insertions(+), 21 deletions(-) diff --git a/Tensile/ClientWriter.py b/Tensile/ClientWriter.py index 2ee9ba083..dd6e746fa 100644 --- a/Tensile/ClientWriter.py +++ b/Tensile/ClientWriter.py @@ -142,13 +142,16 @@ def writeRunScript(path, libraryLogicPath, forBenchmark): if os.name != "nt" else "") ) #if os.name != "nt": # runScriptFile.write("find .\n") - runScriptFile.write("%s & echo %s & echo # Running Client & echo %s\n" \ - % (echoLine, HR, HR)) + # % (echoLine, HR, HR)) + executablePath = os.path.join(globalParameters["WorkingPath"]) if os.name == "nt": - runScriptFile.write(os.path.join(globalParameters["CMakeBuildType"], \ - "client.exe") ) + executablePath = os.path.join(executablePath, \ + globalParameters["CMakeBuildType"], \ + "client.exe") else: - runScriptFile.write("./client") + executablePath = os.path.join(executablePath, "client") + runScriptFile.write("%s & echo %s & echo # Library Client Path: & echo %s\n" \ + % (echoLine, HR, executablePath) ) runScriptFile.close() if os.name != "nt": os.chmod(runScriptName, 0777) diff --git a/Tensile/Source/Client.h b/Tensile/Source/Client.h index eee665ca4..e0fc00446 100644 --- a/Tensile/Source/Client.h +++ b/Tensile/Source/Client.h @@ -30,6 +30,7 @@ #include TensileTimer timer; +TensileTimer apiTimer; std::ofstream file; void initControls(); @@ -135,7 +136,8 @@ bool callLibrary( std::cout << " Device | Reference" << std::endl; firstPrint = false; } - std::cout << "[" << (numChecked-1) << "] " << i << ": " << tensileToString(deviceOnHostC[i]) + std::cout << "[" << (numChecked-1) << "] " << i << ": " + << tensileToString(deviceOnHostC[i]) << (equal ? "==" : "!=") << tensileToString(referenceC[i]) << std::endl; printIdx++; @@ -146,10 +148,14 @@ bool callLibrary( // time solution timer.start(); + double apiTimeUs = 0; for (unsigned int syncIdx = 0; syncIdx < numSyncsPerBenchmark; syncIdx++) { + apiTimer.start(); for (unsigned int enqIdx = 0; enqIdx < numEnqueuesPerSync; enqIdx++) { generatedCallToFunction( userSizes, alpha, beta ); } + double currentApiTimeUs = apiTimer.elapsed_us() / numEnqueuesPerSync; + apiTimeUs += currentApiTimeUs; // sync #if Tensile_BACKEND_OCL status = clFinish(stream); tensileStatusCheck(status); @@ -174,30 +180,35 @@ bool callLibrary( << numFunctions << "]:" << std::setw(10) << std::fixed << std::setprecision(3) << gflops << " GFlop/s"; - if (newFastest) { - std::cout << "*"; - } else { - std::cout << " "; - } + if (newFastest) { + std::cout << "*"; + } else { + std::cout << " "; + } std::cout << " |" << std::setw(9) << std::fixed << std::setprecision(3) << timeMs << " ms | v: " << (numInvalids ? "FAILED" : "PASSED") - << " p: " << (numChecked-numInvalids) << "/" << numChecked << std::endl; + << " p: " << (numChecked-numInvalids) << "/" << numChecked; + std::cout << " api:" << std::setw(6) << std::fixed + << std::setprecision(3) << apiTimeUs << " us"; + std::cout << std::endl; } else { std::cout << "Function[" << functionIdx << "/" << numFunctions << "]:" << std::setw(10) << std::fixed << std::setprecision(3) << gflops << " GFlop/s"; - if (newFastest) { - std::cout << "*"; - } else { - std::cout << " "; - } + if (newFastest) { + std::cout << "*"; + } else { + std::cout << " "; + } std::cout << " |" << std::setw(9) << std::fixed << std::setprecision(3) << timeMs << " ms"; - if (newFastest) { - std::cout << "*"; - } - std::cout << std::endl; + if (newFastest) { + std::cout << "*"; + } + std::cout << " api:" << std::setw(6) << std::fixed + << std::setprecision(3) << apiTimeUs << " us"; + std::cout << std::endl; } return (numInvalids > 0); } // callLibrary From 71edbd658fbea17fa44ed285e344133540072235 Mon Sep 17 00:00:00 2001 From: David Tanner Date: Fri, 3 Mar 2017 10:04:01 -0600 Subject: [PATCH 19/21] client prints api time --- Tensile/Source/Client.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/Tensile/Source/Client.h b/Tensile/Source/Client.h index e0fc00446..401b6ab8f 100644 --- a/Tensile/Source/Client.h +++ b/Tensile/Source/Client.h @@ -164,6 +164,7 @@ bool callLibrary( #endif tensileStatusCheck(status); } // sync loop + apiTimeUs /= numSyncsPerBenchmark; double timeMs = timer.elapsed_ms() / numSyncsPerBenchmark / numEnqueuesPerSync; @@ -188,8 +189,8 @@ bool callLibrary( std::cout << " |" << std::setw(9) << std::fixed << std::setprecision(3) << timeMs << " ms | v: " << (numInvalids ? "FAILED" : "PASSED") - << " p: " << (numChecked-numInvalids) << "/" << numChecked; - std::cout << " api:" << std::setw(6) << std::fixed + << " " << (numChecked-numInvalids) << "/" << numChecked; + std::cout << " | api:" << std::setw(6) << std::fixed << std::setprecision(3) << apiTimeUs << " us"; std::cout << std::endl; } else { @@ -206,7 +207,7 @@ bool callLibrary( if (newFastest) { std::cout << "*"; } - std::cout << " api:" << std::setw(6) << std::fixed + std::cout << " | api:" << std::setw(6) << std::fixed << std::setprecision(3) << apiTimeUs << " us"; std::cout << std::endl; } @@ -370,7 +371,7 @@ bool benchmarkAllSolutionsForSize( } std::cout << " |" << std::setw(9) << std::fixed << std::setprecision(3) << timeMs << " ms | v: " << (numInvalids ? "FAILED" : "PASSED") - << " p: " << (numChecked-numInvalids) << "/" << numChecked << std::endl; + << " " << (numChecked-numInvalids) << "/" << numChecked << std::endl; } #if 1 else { From 40d5315765da6fa24d986c31ae76039d09b17240 Mon Sep 17 00:00:00 2001 From: David Tanner Date: Fri, 3 Mar 2017 13:42:31 -0600 Subject: [PATCH 20/21] fixed clientwriter executing exe for benchmarks but printing exe only for library --- Tensile/ClientWriter.py | 26 ++++++++++++++----------- Tensile/Configs/rocblas_cgemm.yaml | 3 +-- Tensile/Configs/rocblas_dgemm.yaml | 3 +-- Tensile/Configs/rocblas_sgemm.yaml | 3 +-- Tensile/Configs/rocblas_zgemm.yaml | 3 +-- Tensile/Configs/sgemm.yaml | 3 +-- Tensile/Configs/tensor_contraction.yaml | 3 +-- 7 files changed, 21 insertions(+), 23 deletions(-) diff --git a/Tensile/ClientWriter.py b/Tensile/ClientWriter.py index dd6e746fa..de391a40e 100644 --- a/Tensile/ClientWriter.py +++ b/Tensile/ClientWriter.py @@ -140,18 +140,22 @@ def writeRunScript(path, libraryLogicPath, forBenchmark): runScriptFile.write("cmake --build . --config %s%s\n" \ % (globalParameters["CMakeBuildType"], " -- -j 8" \ if os.name != "nt" else "") ) - #if os.name != "nt": - # runScriptFile.write("find .\n") - # % (echoLine, HR, HR)) - executablePath = os.path.join(globalParameters["WorkingPath"]) - if os.name == "nt": - executablePath = os.path.join(executablePath, \ - globalParameters["CMakeBuildType"], \ - "client.exe") + if forBenchmark: + if os.name == "nt": + runScriptFile.write(os.path.join(globalParameters["CMakeBuildType"], \ + "client.exe") ) + else: + runScriptFile.write("./client") else: - executablePath = os.path.join(executablePath, "client") - runScriptFile.write("%s & echo %s & echo # Library Client Path: & echo %s\n" \ - % (echoLine, HR, executablePath) ) + executablePath = os.path.join(globalParameters["WorkingPath"]) + if os.name == "nt": + executablePath = os.path.join(executablePath, \ + globalParameters["CMakeBuildType"], \ + "client.exe") + else: + executablePath = os.path.join(executablePath, "client") + runScriptFile.write("%s & echo %s & echo # Library Client Path: & echo %s\n" \ + % (echoLine, HR, executablePath) ) runScriptFile.close() if os.name != "nt": os.chmod(runScriptName, 0777) diff --git a/Tensile/Configs/rocblas_cgemm.yaml b/Tensile/Configs/rocblas_cgemm.yaml index 3c0fbd958..40e68b3a2 100644 --- a/Tensile/Configs/rocblas_cgemm.yaml +++ b/Tensile/Configs/rocblas_cgemm.yaml @@ -151,7 +151,6 @@ BenchmarkProblems: - ProblemSizes: [ [32, 32, 32, 4000], [32, 32, 32, 4000], [2], [1536] ] LibraryLogic: - Dilation: 3 - Threshold: 0.1 + BranchPenalty: 0 LibraryClient: diff --git a/Tensile/Configs/rocblas_dgemm.yaml b/Tensile/Configs/rocblas_dgemm.yaml index f4bc58257..51e8eea11 100644 --- a/Tensile/Configs/rocblas_dgemm.yaml +++ b/Tensile/Configs/rocblas_dgemm.yaml @@ -151,7 +151,6 @@ BenchmarkProblems: - ProblemSizes: [ [32, 32, 32, 4000], [32, 32, 32, 4000], [2], [1536] ] LibraryLogic: - Dilation: 3 - Threshold: 0.1 + BranchPenalty: 0 LibraryClient: diff --git a/Tensile/Configs/rocblas_sgemm.yaml b/Tensile/Configs/rocblas_sgemm.yaml index fedec0deb..c4ed34214 100644 --- a/Tensile/Configs/rocblas_sgemm.yaml +++ b/Tensile/Configs/rocblas_sgemm.yaml @@ -151,7 +151,6 @@ BenchmarkProblems: - ProblemSizes: [ [32, 32, 32, 4000], [32, 32, 32, 4000], [2], [2880] ] LibraryLogic: - Dilation: 3 - Threshold: 0.1 + BranchPenalty: 0 LibraryClient: diff --git a/Tensile/Configs/rocblas_zgemm.yaml b/Tensile/Configs/rocblas_zgemm.yaml index 64f0f6af9..4082ed6c8 100644 --- a/Tensile/Configs/rocblas_zgemm.yaml +++ b/Tensile/Configs/rocblas_zgemm.yaml @@ -151,7 +151,6 @@ BenchmarkProblems: - ProblemSizes: [ [32, 32, 32, 4000], [32, 32, 32, 4000], [2], [1536] ] LibraryLogic: - Dilation: 3 - Threshold: 0.1 + BranchPenalty: 0 LibraryClient: diff --git a/Tensile/Configs/sgemm.yaml b/Tensile/Configs/sgemm.yaml index 7d2805e4a..f9ddad0c5 100644 --- a/Tensile/Configs/sgemm.yaml +++ b/Tensile/Configs/sgemm.yaml @@ -49,7 +49,6 @@ BenchmarkProblems: - ProblemSizes: [ [16, 128], [16, 128], [256] ] LibraryLogic: - Dilation: 3 - Threshold: 0.1 + BranchPenalty: 0 LibraryClient: diff --git a/Tensile/Configs/tensor_contraction.yaml b/Tensile/Configs/tensor_contraction.yaml index 8201d0bec..aa5e779f4 100644 --- a/Tensile/Configs/tensor_contraction.yaml +++ b/Tensile/Configs/tensor_contraction.yaml @@ -50,7 +50,6 @@ BenchmarkProblems: - ProblemSizes: [ [16, 128], [16, 128], [2, 2, 4], [256] ] LibraryLogic: - Dilation: 3 - Threshold: 0.1 + BranchPenalty: 0 LibraryClient: From 900d32fb8b811cbd0a9ee462f24e984688ab5e3d Mon Sep 17 00:00:00 2001 From: David Tanner Date: Fri, 3 Mar 2017 15:50:15 -0600 Subject: [PATCH 21/21] lowering default BranchPenalty since logic is inexpensive (1.5us) --- Tensile/Common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Tensile/Common.py b/Tensile/Common.py index c2b8f54b3..2e3eb3151 100644 --- a/Tensile/Common.py +++ b/Tensile/Common.py @@ -153,7 +153,7 @@ ################################################################################ defaultAnalysisParameters = { "InitialSolutionWindow": 4, - "BranchPenalty": 10000, # microseconds / kernel + "BranchPenalty": 100, # microseconds / kernel "SmoothOutliers": False, # enforce monotonic data "SolutionImportanceMin": 0.01, # = 1% }