Skip to content

Commit

Permalink
Merge pull request #75 from guacamoleo/develop
Browse files Browse the repository at this point in the history
v2.2 new solution selection logic
  • Loading branch information
guacamoleo authored Mar 3, 2017
2 parents febbca8 + 900d32f commit 48610a1
Show file tree
Hide file tree
Showing 18 changed files with 1,358 additions and 789 deletions.
20 changes: 13 additions & 7 deletions Tensile/BenchmarkProblems.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ def benchmarkProblemType( config ):

totalBenchmarkSteps = len(benchmarkProcess)
winners = WinningParameterDict()
print1("NumBenchmarkSteps: %u" % totalBenchmarkSteps)
print1("# NumBenchmarkSteps: %u" % totalBenchmarkSteps)
print1("")
print1(HR)
print1("# Done Creating BenchmarkProcess Object")
Expand Down Expand Up @@ -148,6 +148,7 @@ def benchmarkProblemType( config ):
# Enumerate Solutions = Hardcoded * Benchmark
############################################################################
sys.stdout.write("# Enumerating Solutions")
solutionSet = set() # avoid duplicates for nlca=-1, 1
for hardcodedIdx in range(0, numHardcoded):
solutions.append([])
hardcodedParamDict = benchmarkStep.hardcodedParameters[hardcodedIdx]
Expand All @@ -169,10 +170,15 @@ def benchmarkProblemType( config ):
benchmarkStep.initialSolutionParameters[initialSolutionParameterName]
# TODO check if solution matches problem size for exact tile kernels
solutionObject = Solution(solution)
if SolutionWriter.solutionParametersConsistent(solutionObject):
solutions[hardcodedIdx].append(solutionObject)
if globalParameters["PrintLevel"] >= 1:
sys.stdout.write("|")
if solutionObject["Valid"]:
if solutionObject not in solutionSet:
solutionSet.add(solutionObject)
solutions[hardcodedIdx].append(solutionObject)
if globalParameters["PrintLevel"] >= 1:
sys.stdout.write("|")
else:
if globalParameters["PrintLevel"] >= 1:
sys.stdout.write(":")
else:
if globalParameters["PrintLevel"] >= 1:
sys.stdout.write(".")
Expand Down Expand Up @@ -541,8 +547,8 @@ def get( lookupHardcodedParameters, winners ):
#for paramName in hardcodedFrozen:
# paramValue = hardcodedFrozen[paramName]
# matchUnion[paramName] = paramValue
Solution.assignDimsFromEdgeAndShape(matchUnion)
Solution.assignDimsFromEdgeAndShape(hardcodedFrozen.parameters)
Solution.assignProblemIndependentDerivedParameters(matchUnion)
Solution.assignProblemIndependentDerivedParameters(hardcodedFrozen.parameters)
if matchUnion["MacroTile0"] != lookupMacroTile0 \
or matchUnion["MacroTile1"] != lookupMacroTile1:
matchMacroTile = False
Expand Down
14 changes: 8 additions & 6 deletions Tensile/BenchmarkStructs.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
# from copy import *

from copy import copy, deepcopy
from Common import print1, print2, printWarning, defaultSolution, defaultProblemSizes, defaultBenchmarkFinalProblemSizes, defaultBenchmarkCommonParameters, hasParam, defaultBenchmarkJoinParameters, getParamValues, defaultForkParameters, defaultBenchmarkForkParameters, defaultJoinParameters, printExit
from Common import print1, print2, printWarning, defaultSolution, defaultProblemSizes, defaultBenchmarkFinalProblemSizes, defaultBenchmarkCommonParameters, hasParam, defaultBenchmarkJoinParameters, getParamValues, defaultForkParameters, defaultBenchmarkForkParameters, defaultJoinParameters, printExit, globalParameters
from SolutionStructs import Solution, ProblemType, ProblemSizes

################################################################################
Expand Down Expand Up @@ -399,14 +399,16 @@ def convertParametersToSteps(self):
macroTileDim0 = workGroupEdgeValues[workGroupEdgeIdx]*threadTileEdgeValues[threadTileEdgeIdx]
macroTileDim1 = macroTileDim0
if workGroupShapeValues[workGroupShapeIdx] < 0:
macroTileDim1 /= 2
macroTileDim0 *= abs(workGroupShapeValues[workGroupShapeIdx])
elif workGroupShapeValues[workGroupShapeIdx] > 0:
macroTileDim1 *= 2
macroTileDim1 *= abs(workGroupShapeValues[workGroupShapeIdx])
if threadTileShapeValues[threadTileShapeIdx] < 0:
macroTileDim1 /= 2
macroTileDim0 *= abs(threadTileShapeValues[threadTileShapeIdx])
elif threadTileShapeValues[threadTileShapeIdx] > 0:
macroTileDim1 *= 2
if macroTileDim0/macroTileDim1 <= self.initialSolutionParameters["MacroTileMaxRatio"] and macroTileDim1/macroTileDim0 <= self.initialSolutionParameters["MacroTileMaxRatio"]:
macroTileDim1 *= abs(threadTileShapeValues[threadTileShapeIdx])
# TODO is this still useful?
if macroTileDim0/macroTileDim1 <= globalParameters["MaxMacroTileRatio"] \
and macroTileDim1/macroTileDim0 <= globalParameters["MaxMacroTileRatio"]:
macroTileJoinSet.add((macroTileDim0, macroTileDim1))
totalPermutations *=len(macroTileJoinSet)
print2("JoinMacroTileSet(%u): %s" % (len(macroTileJoinSet), macroTileJoinSet) )
Expand Down
32 changes: 20 additions & 12 deletions Tensile/ClientWriter.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from Common import globalParameters, HR, pushWorkingPath, popWorkingPath, print1, CHeader, printExit
from Common import globalParameters, HR, pushWorkingPath, popWorkingPath, print1, CHeader, printExit, printWarning
from SolutionStructs import Solution
from SolutionWriter import SolutionWriter
import YAMLIO
Expand Down Expand Up @@ -54,13 +54,14 @@ def main( config ):
##############################################################################
logicFiles = [os.path.join(libraryLogicPath, f) for f \
in os.listdir(libraryLogicPath) \
if os.path.isfile(os.path.join(libraryLogicPath, f))]
if (os.path.isfile(os.path.join(libraryLogicPath, f)) \
and os.path.splitext(f)[1]==".yaml")]
print1("LogicFiles: %s" % logicFiles)
functions = []
functionNames = []
for logicFileName in logicFiles:
(scheduleName, problemType, solutionsForType, skinnyLogic0, skinnyLogic1, \
diagonalLogic) = YAMLIO.readLibraryLogicForProblemType(logicFileName)
(scheduleName, problemType, solutionsForType, indexOrder, logic) \
= YAMLIO.readLibraryLogicForProblemType(logicFileName)
functions.append((scheduleName, problemType))
functionNames.append("tensile_%s_%s" % (scheduleName, problemType))

Expand Down Expand Up @@ -139,15 +140,22 @@ def writeRunScript(path, libraryLogicPath, forBenchmark):
runScriptFile.write("cmake --build . --config %s%s\n" \
% (globalParameters["CMakeBuildType"], " -- -j 8" \
if os.name != "nt" else "") )
#if os.name != "nt":
# runScriptFile.write("find .\n")
runScriptFile.write("%s & echo %s & echo # Running Client & echo %s\n" \
% (echoLine, HR, HR))
if os.name == "nt":
runScriptFile.write(os.path.join(globalParameters["CMakeBuildType"], \
"client.exe") )
if forBenchmark:
if os.name == "nt":
runScriptFile.write(os.path.join(globalParameters["CMakeBuildType"], \
"client.exe") )
else:
runScriptFile.write("./client")
else:
runScriptFile.write("./client")
executablePath = os.path.join(globalParameters["WorkingPath"])
if os.name == "nt":
executablePath = os.path.join(executablePath, \
globalParameters["CMakeBuildType"], \
"client.exe")
else:
executablePath = os.path.join(executablePath, "client")
runScriptFile.write("%s & echo %s & echo # Library Client Path: & echo %s\n" \
% (echoLine, HR, executablePath) )
runScriptFile.close()
if os.name != "nt":
os.chmod(runScriptName, 0777)
Expand Down
29 changes: 17 additions & 12 deletions Tensile/Common.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,8 +49,10 @@
globalParameters["DataInitType"] = 0 # 0=rand, 1=1, 2=serial
# protect against invalid kernel
globalParameters["MaxThreads"] = 256
globalParameters["MaxRegisters"] = 256
globalParameters["MinThreads"] = 64
globalParameters["MaxLDS"] = 32768
globalParameters["MaxMacroTileRatio"] = 4
globalParameters["MaxThreadTile"] = 64


################################################################################
Expand All @@ -67,15 +69,17 @@
{"EdgeType": [ "Branch" ] }, # Shift
{"EdgeMultiKernel": [ False ] },
{"PadLDS": [ 1 ] },
{"SplitU": [ 1 ] },
{"Prefetch": [ False ] },
]
# benchmark these solutions independently
defaultForkParameters = [
{"WorkGroupEdge": [ 16, 8 ] },
{"WorkGroupShape": [ 0 ] }, # -1, 0, 1
{"WorkGroupShape": [ 0 ] }, # -4, -2, 0, 2, 4
{"ThreadTileEdge": [ 1, 2, 4, 6, 8 ] },
{"ThreadTileShape": [ 0 ] }, # -1, 0, 1
{"SplitU": [ 1 ] },
{"Prefetch": [ False ] },
{"ThreadTileShape": [ 0 ] }, # -4, -2, 0, 2, 4
{"NumLoadsCoalescedA": [ 1, -1 ] },
{"NumLoadsCoalescedB": [ 1, -1 ] },
]
# keep one winner per solution; these parameters affect which solution wins
defaultBenchmarkForkParameters = [
Expand All @@ -88,18 +92,17 @@
]
# keep one winner per solution; these parameters would affect which solution is fastest
defaultBenchmarkJoinParameters = [
{"NumLoadsCoalescedA": [ 1, 2, 3, 4, 6, 8 ] },
{"NumLoadsCoalescedB": [ 1, 2, 3, 4, 6, 8 ] },
{"VectorWidthGlobalLoad": [ 4 ] },
{"VectorWidthGlobalStore": [ 4 ] },
{"VectorWidthLocalLoad": [ 4 ] },
{"VectorWidthLocalStore": [ 4 ] },
]

# derrived parameters may show up in solution dict but don't use for naming
derrivedParameters = [
# derived parameters may show up in solution dict but don't use for naming
derivedParameters = [
"MacroTile0",
"MacroTile1",
"DepthU",
"WorkGroup0",
"WorkGroup1",
"ThreadTile0",
Expand All @@ -108,6 +111,7 @@
"NumLoadsB",
"NumLoadsPerpendicularA",
"NumLoadsPerpendicularB",
"NumThreads",
]

# dictionary of defaults comprised of the 1st option for each parameter
Expand All @@ -118,7 +122,6 @@
for key, value in paramDict.iteritems():
defaultSolution[key] = value[0]
# other non-benchmark options for solutions
defaultSolution["MacroTileMaxRatio"] = 2

################################################################################
# Default Problem Type
Expand Down Expand Up @@ -149,8 +152,10 @@
# Default Analysis Parameters
################################################################################
defaultAnalysisParameters = {
"Dilation": 3,
"Threshold": 0.1,
"InitialSolutionWindow": 4,
"BranchPenalty": 100, # microseconds / kernel
"SmoothOutliers": False, # enforce monotonic data
"SolutionImportanceMin": 0.01, # = 1%
}


Expand Down
19 changes: 9 additions & 10 deletions Tensile/Configs/rocblas_cgemm.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -41,8 +41,8 @@ BenchmarkProblems:
- ThreadTileShape: [ 0 ]
- WorkGroupEdge: [8, 16]
- ThreadTileEdge: [2, 4, 6, 8]
- NumLoadsCoalescedA: [ 1, 2, 3, 4, 6, 8 ]
- NumLoadsCoalescedB: [ 1, 2, 3, 4, 6, 8 ]
- NumLoadsCoalescedA: [ 1, -1 ]
- NumLoadsCoalescedB: [ 1, -1 ]
- LoopUnroll: [8, 16]
BenchmarkForkParameters:
JoinParameters:
Expand Down Expand Up @@ -74,8 +74,8 @@ BenchmarkProblems:
- WorkGroupShape: [ 0 ]
- ThreadTileEdge: [2, 4, 6, 8]
- ThreadTileShape: [ 0 ]
- NumLoadsCoalescedA: [ 1, 2, 3, 4, 6, 8 ]
- NumLoadsCoalescedB: [ 1, 2, 3, 4, 6, 8 ]
- NumLoadsCoalescedA: [ 1, -1 ]
- NumLoadsCoalescedB: [ 1, -1 ]
- LoopUnroll: [8, 16]
BenchmarkForkParameters:
JoinParameters:
Expand Down Expand Up @@ -107,8 +107,8 @@ BenchmarkProblems:
- WorkGroupShape: [ 0 ]
- ThreadTileEdge: [2, 4, 6, 8]
- ThreadTileShape: [ 0 ]
- NumLoadsCoalescedA: [ 1, 2, 3, 4, 6, 8 ]
- NumLoadsCoalescedB: [ 1, 2, 3, 4, 6, 8 ]
- NumLoadsCoalescedA: [ 1, -1 ]
- NumLoadsCoalescedB: [ 1, -1 ]
- LoopUnroll: [8, 16]
BenchmarkForkParameters:
JoinParameters:
Expand Down Expand Up @@ -140,8 +140,8 @@ BenchmarkProblems:
- WorkGroupShape: [ 0 ]
- ThreadTileEdge: [2, 4, 6, 8]
- ThreadTileShape: [ 0 ]
- NumLoadsCoalescedA: [ 1, 2, 3, 4, 6, 8 ]
- NumLoadsCoalescedB: [ 1, 2, 3, 4, 6, 8 ]
- NumLoadsCoalescedA: [ 1, -1 ]
- NumLoadsCoalescedB: [ 1, -1 ]
- LoopUnroll: [8, 16]
BenchmarkForkParameters:
JoinParameters:
Expand All @@ -151,7 +151,6 @@ BenchmarkProblems:
- ProblemSizes: [ [32, 32, 32, 4000], [32, 32, 32, 4000], [2], [1536] ]

LibraryLogic:
Dilation: 3
Threshold: 0.1
BranchPenalty: 0

LibraryClient:
19 changes: 9 additions & 10 deletions Tensile/Configs/rocblas_dgemm.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -41,8 +41,8 @@ BenchmarkProblems:
- ThreadTileShape: [ 0 ]
- WorkGroupEdge: [8, 16]
- ThreadTileEdge: [2, 4, 6, 8]
- NumLoadsCoalescedA: [ 1, 2, 3, 4, 6, 8 ]
- NumLoadsCoalescedB: [ 1, 2, 3, 4, 6, 8 ]
- NumLoadsCoalescedA: [ 1, -1 ]
- NumLoadsCoalescedB: [ 1, -1 ]
- LoopUnroll: [8, 16]
BenchmarkForkParameters:
JoinParameters:
Expand Down Expand Up @@ -74,8 +74,8 @@ BenchmarkProblems:
- WorkGroupShape: [ 0 ]
- ThreadTileEdge: [2, 4, 6, 8]
- ThreadTileShape: [ 0 ]
- NumLoadsCoalescedA: [ 1, 2, 3, 4, 6, 8 ]
- NumLoadsCoalescedB: [ 1, 2, 3, 4, 6, 8 ]
- NumLoadsCoalescedA: [ 1, -1 ]
- NumLoadsCoalescedB: [ 1, -1 ]
- LoopUnroll: [8, 16]
BenchmarkForkParameters:
JoinParameters:
Expand Down Expand Up @@ -107,8 +107,8 @@ BenchmarkProblems:
- WorkGroupShape: [ 0 ]
- ThreadTileEdge: [2, 4, 6, 8]
- ThreadTileShape: [ 0, 1 ]
- NumLoadsCoalescedA: [ 1, 2, 3, 4, 6, 8 ]
- NumLoadsCoalescedB: [ 1, 2, 3, 4, 6, 8 ]
- NumLoadsCoalescedA: [ 1, -1 ]
- NumLoadsCoalescedB: [ 1, -1 ]
- LoopUnroll: [8, 16]
BenchmarkForkParameters:
JoinParameters:
Expand Down Expand Up @@ -140,8 +140,8 @@ BenchmarkProblems:
- WorkGroupShape: [ 0 ]
- ThreadTileEdge: [2, 4, 6, 8]
- ThreadTileShape: [ 0 ]
- NumLoadsCoalescedA: [ 1, 2, 3, 4, 6, 8 ]
- NumLoadsCoalescedB: [ 1, 2, 3, 4, 6, 8 ]
- NumLoadsCoalescedA: [ 1, -1 ]
- NumLoadsCoalescedB: [ 1, -1 ]
- LoopUnroll: [8, 16]
BenchmarkForkParameters:
JoinParameters:
Expand All @@ -151,7 +151,6 @@ BenchmarkProblems:
- ProblemSizes: [ [32, 32, 32, 4000], [32, 32, 32, 4000], [2], [1536] ]

LibraryLogic:
Dilation: 3
Threshold: 0.1
BranchPenalty: 0

LibraryClient:
19 changes: 9 additions & 10 deletions Tensile/Configs/rocblas_sgemm.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -41,8 +41,8 @@ BenchmarkProblems:
- ThreadTileShape: [ 0 ]
- WorkGroupEdge: [8, 16]
- ThreadTileEdge: [2, 4, 6, 8]
- NumLoadsCoalescedA: [ 1, 2, 3, 4, 6, 8 ]
- NumLoadsCoalescedB: [ 1, 2, 3, 4, 6, 8 ]
- NumLoadsCoalescedA: [ 1, -1 ]
- NumLoadsCoalescedB: [ 1, -1 ]
- LoopUnroll: [8, 16]
BenchmarkForkParameters:
JoinParameters:
Expand Down Expand Up @@ -74,8 +74,8 @@ BenchmarkProblems:
- WorkGroupShape: [ 0 ]
- ThreadTileEdge: [2, 4, 6, 8]
- ThreadTileShape: [ 0 ]
- NumLoadsCoalescedA: [ 1, 2, 3, 4, 6, 8 ]
- NumLoadsCoalescedB: [ 1, 2, 3, 4, 6, 8 ]
- NumLoadsCoalescedA: [ 1, -1 ]
- NumLoadsCoalescedB: [ 1, -1 ]
- LoopUnroll: [8, 16]
BenchmarkForkParameters:
JoinParameters:
Expand Down Expand Up @@ -107,8 +107,8 @@ BenchmarkProblems:
- WorkGroupShape: [ 0 ]
- ThreadTileEdge: [2, 4, 6, 8]
- ThreadTileShape: [ 0, 1 ]
- NumLoadsCoalescedA: [ 1, 2, 3, 4, 6, 8 ]
- NumLoadsCoalescedB: [ 1, 2, 3, 4, 6, 8 ]
- NumLoadsCoalescedA: [ 1, -1 ]
- NumLoadsCoalescedB: [ 1, -1 ]
- LoopUnroll: [8, 16]
BenchmarkForkParameters:
JoinParameters:
Expand Down Expand Up @@ -140,8 +140,8 @@ BenchmarkProblems:
- WorkGroupShape: [ 0 ]
- ThreadTileEdge: [2, 4, 6, 8]
- ThreadTileShape: [ 0 ]
- NumLoadsCoalescedA: [ 1, 2, 3, 4, 6, 8 ]
- NumLoadsCoalescedB: [ 1, 2, 3, 4, 6, 8 ]
- NumLoadsCoalescedA: [ 1, -1 ]
- NumLoadsCoalescedB: [ 1, -1 ]
- LoopUnroll: [8, 16]
BenchmarkForkParameters:
JoinParameters:
Expand All @@ -151,7 +151,6 @@ BenchmarkProblems:
- ProblemSizes: [ [32, 32, 32, 4000], [32, 32, 32, 4000], [2], [2880] ]

LibraryLogic:
Dilation: 3
Threshold: 0.1
BranchPenalty: 0

LibraryClient:
Loading

0 comments on commit 48610a1

Please sign in to comment.