Commit d1ddcc5

simplify outputs
jiazhihao committed Jan 26, 2019
1 parent f84d103 commit d1ddcc5
Showing 20 changed files with 63 additions and 91 deletions.
6 changes: 3 additions & 3 deletions code/Makefile
@@ -12,9 +12,9 @@ LDFLAGS_GPU = -lcudnn -lcublas
LDFLAGS_CPU = -L$(MKLPATH)/lib -lmklml_intel -liomp5
LDFLAGS_TRT = $(LDFLAGS_GPU) -L$(TRTPATH)/lib -lnvinfer

-BIN_GPU = mf
-BIN_CPU = mf_mkl
-BIN_TRT = mf_trt
+BIN_GPU = ../mf
+BIN_CPU = ../mf_mkl
+BIN_TRT = ../mf_trt

gpu: $(BIN_GPU)
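Note: with the ../ prefix, make gpu now drops the mf binary in the repository root (one level above code/) instead of next to the sources; mf_mkl and mf_trt move the same way, which matches the new install.sh added at the bottom of this commit (it builds from inside code/).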

2 changes: 2 additions & 0 deletions code/activation_cudnn.cu
@@ -127,8 +127,10 @@ void Model::measure_activation_cost(Activation* act)
float milliseconds;
cudaEventElapsedTime(&milliseconds, startEvent, endEvent);
act->runtime = milliseconds / REPEAT_TIMES;
+#ifdef VERBOSE
printf("measure[Activation]: i(%d %d %d %d) type(%d) cost(%.4lf)\n",
act->inputs[0].dim[0], act->inputs[0].dim[1], act->inputs[0].dim[2],
act->inputs[0].dim[3], act->type, act->runtime);
+#endif
}
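Note: the guard above is the pattern this commit applies across almost every file: per-operator measurement output is compiled in only when VERBOSE is defined. A minimal self-contained sketch of the idiom (names are illustrative, not from the repo):

#include <cstdio>

// Compile with -DVERBOSE to keep the diagnostic; without it the
// preprocessor drops the printf entirely, so normal builds stay quiet
// and pay no runtime cost.
float report_cost(float milliseconds, int repeat_times) {
  float runtime = milliseconds / repeat_times;
#ifdef VERBOSE
  printf("measure[Something]: cost(%.4lf)\n", runtime);
#endif
  return runtime;
}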

25 changes: 12 additions & 13 deletions code/cnn.cc
@@ -54,8 +54,8 @@ Graph* optimize_graph(Graph *graph, Model *model, float alpha, int budget)
hashmap.insert(graph->hash());
Graph *bestGraph = graph;
float bestCost = graph->total_cost();
printf("baselineCost = %.4lfms\n", bestCost);
printf("baselineGraph: end-to-end runtime = %.4lfms\n", graph->run(model));
//printf("baselineCost = %.4lfms\n", bestCost);
printf("Baseline Graph:\n End-to-end runtime = %.4lfms\n", graph->run(model));
graph->print_costs();

int counter = 0;
@@ -67,21 +67,26 @@ Graph* optimize_graph(Graph *graph, Model *model, float alpha, int budget)
bestCost = subGraph->total_cost();
bestGraph = subGraph;
}
if (subGraph->total_cost() > alpha * bestCost) {
delete subGraph;
continue;
}
if (counter > budget) {
+// TODO: free all remaining candidates when budget exhausted
break;
}
+#ifdef VERBOSE
if (counter % 100 == 0)
printf("[%d] cost = %.4lf bestCost = %.4lf candidates.size() = %zu\n", counter, subGraph->total_cost(), bestCost, candidates.size());
+#endif
counter ++;
for (int i = 0; i < xfers.size(); i++)
xfers[i]->run(0, subGraph, candidates, hashmap, bestCost * alpha);
+if (bestGraph != subGraph) {
+delete subGraph;
+}
}
printf("bestCost = %.4lf\n", bestCost);
printf("bestGraph: end-to-end runtime = %.2lf\n", bestGraph->run(model));
printf("Optimized Graph:\n End-to-end runtime = %.4lfms\n", bestGraph->run(model));
bestGraph->print_costs();

return bestGraph;
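Note: this hunk is the heart of the optimizer, a relaxed best-first search: the cheapest candidate is dequeued first, anything costing more than alpha times the best graph found so far is pruned (the relaxation is what lets cost-increasing substitutions survive long enough to pay off), and budget caps how many candidates are explored. A compilable miniature of that control flow with toy stand-ins; this is a sketch of the visible logic, not the repo's classes:

#include <cstdio>
#include <queue>
#include <set>
#include <vector>

// Toy stand-ins; the real Graph and GraphXfer classes are only
// partially visible in this diff.
struct Graph {
  float cost;
  float total_cost() const { return cost; }
  size_t hash() const { return (size_t)(cost * 1e6f); }
};
struct GraphCompare {  // min-heap: cheapest candidate on top (assumed)
  bool operator()(Graph* a, Graph* b) const {
    return a->total_cost() > b->total_cost();
  }
};
typedef std::priority_queue<Graph*, std::vector<Graph*>, GraphCompare> Queue;

// Stand-in for GraphXfer::run: enqueue one rewritten copy unless it is
// a duplicate or already exceeds the pruning threshold.
void apply_xfers(Graph* g, Queue& candidates, std::set<size_t>& hashmap,
                 float threshold) {
  Graph* next = new Graph{g->cost * 0.98f};  // pretend the rewrite helps
  if (next->total_cost() > threshold || !hashmap.insert(next->hash()).second)
    delete next;
  else
    candidates.push(next);
}

Graph* search(Graph* start, float alpha, int budget) {
  Queue candidates;
  std::set<size_t> hashmap;
  candidates.push(start);
  hashmap.insert(start->hash());
  Graph* bestGraph = start;
  float bestCost = start->total_cost();
  int counter = 0;
  while (!candidates.empty()) {
    Graph* subGraph = candidates.top();
    candidates.pop();
    if (subGraph->total_cost() < bestCost) {
      bestCost = subGraph->total_cost();
      bestGraph = subGraph;
    }
    // Relaxed pruning: keep anything within alpha of the best so far.
    if (subGraph->total_cost() > alpha * bestCost) {
      delete subGraph;
      continue;
    }
    if (counter++ > budget)
      break;  // queued candidates leak here, the TODO in this hunk
    apply_xfers(subGraph, candidates, hashmap, bestCost * alpha);
    if (bestGraph != subGraph) delete subGraph;
  }
  return bestGraph;
}

int main() {
  Graph start{10.0f};
  printf("bestCost = %.4f\n", search(&start, 1.01f, 300)->total_cost());
  return 0;
}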
@@ -106,6 +111,7 @@ DNNModel name_to_model(std::string name)
if (name == "resnet50") return Resnet50;
if (name == "densenet") return DenseNet;
if (name == "rnntc") return RNNTC;
+assert(false);
}
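Note: the added assert does more than tidy output: name_to_model returns from every recognized branch, and without a terminating assert an unknown name would fall off the end of a non-void function, which is undefined behavior in C++. A table-driven equivalent (hypothetical sketch; enumerators inferred from the model headers in this commit):

#include <cassert>
#include <string>
#include <unordered_map>

enum DNNModel { None, SqueezeNet, InceptionV3, Resnet34, Resnet50, DenseNet, RNNTC };

DNNModel name_to_model(const std::string& name) {
  static const std::unordered_map<std::string, DNNModel> table = {
      {"squeezenet", SqueezeNet}, {"inception", InceptionV3},
      {"resnet34", Resnet34},     {"resnet50", Resnet50},
      {"densenet", DenseNet},     {"rnntc", RNNTC}};
  auto it = table.find(name);
  assert(it != table.end() && "unknown model name");  // mirrors assert(false)
  return it->second;
}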

void parse_args(bool &optimize,
@@ -181,8 +187,8 @@ int main(int argc, char **argv)
{
bool optimize = true;
bool export_graph = false;
-int budget = 1024 * 2; // 2K candidates
-float alpha = 1.05;
+int budget = 300; // 300 candidates
+float alpha = 1.01;
DNNModel dnn = None;
std::string export_file_name;
parse_args(optimize, export_graph, alpha, budget, export_file_name, dnn, argc, argv);
@@ -210,9 +216,6 @@ int main(int argc, char **argv)
case RNNTC:
graph = RNNTC_SRU(model);
break;
-case NMT:
-graph = NMT_SRU(model);
-break;
default:
assert(false);
}
@@ -226,10 +229,6 @@ int main(int argc, char **argv)
graph = RNNTC_OPT(model);
printf("bestGraph: end-to-end runtime = %.2lf\n", graph->run(model));
graph->print_costs();
} else if (optimize && dnn == NMT) {
graph = NMT_OPT(model);
printf("bestGraph: end-to-end runtime = %.2lf\n", graph->run(model));
graph->print_costs();
} else if (optimize) {
graph = optimize_graph(graph, model, alpha, budget);
}
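Note: the new defaults trade search breadth for time. alpha = 1.01 prunes any candidate whose cost exceeds the current best by more than 1% (once a 10.0 ms graph is known, a 10.2 ms candidate is dropped, since 10.2 > 1.01 * 10.0 = 10.1), and budget = 300 stops the search after 300 dequeued candidates instead of 2048.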
2 changes: 2 additions & 0 deletions code/concat_cudnn.cu
@@ -64,6 +64,8 @@ void Model::measure_concat_cost(Concat* concat)
float milliseconds;
cudaEventElapsedTime(&milliseconds, startEvent, endEvent);
concat->runtime = milliseconds / REPEAT_TIMES;
+#ifdef VERBOSE
printf("measure[Concat]: cost(%.4lf)\n", concat->runtime);
+#endif
}

4 changes: 4 additions & 0 deletions code/conv2d.cc
@@ -26,7 +26,9 @@ Tensor Graph::conv2d(Tensor _input, int _outputC,
inEdges[op];
outEdges[op];
Edge in(_input.idx, _input.op), out(_input.idx, op);
+#ifdef VERBOSE
printf("inEdges[guid = %zu ptr = %p]\n", op.guid, op.ptr);
+#endif
inEdges[op].insert(in);
outEdges[_input.op].insert(out);
Tensor t = op.ptr->outputs[0];
@@ -68,8 +70,10 @@ Conv2D::Conv2D(Model* _model, Tensor _input, int _outputC,
padH(_padH), padW(_padW), relu(_relu)
{
assert(_input.numDim == 4);
+#ifdef VERBOSE
printf("k(%d %d) pad(%d %d) stride(%d %d)\n",
kernelH, kernelW, padH, padW, strideH, strideW);
+#endif
int inputH = _input.dim[2];
int inputW = _input.dim[3];
int outputH = 1 + (inputH + 2 * padH - kernelH) / strideH;
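Note: the output height uses the standard convolution size formula with integer (flooring) division; outputW presumably follows the same form just below this cut. A compile-time check with illustrative numbers:

// outputH = 1 + (inputH + 2*padH - kernelH) / strideH
constexpr int out_size(int in, int pad, int kernel, int stride) {
  return 1 + (in + 2 * pad - kernel) / stride;  // integer division floors
}
static_assert(out_size(56, 1, 3, 1) == 56, "3x3, pad 1, stride 1 keeps 56");
static_assert(out_size(56, 1, 3, 2) == 28, "stride 2: 1 + 55/2 = 28");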
4 changes: 4 additions & 0 deletions code/conv2d_cudnn.cu
@@ -148,10 +148,12 @@ void Model::measure_conv2d_cost(Conv2D* conv)
workSpace, workSpaceSize));
assert(cnt > 0);
checkCUDNN(perfResults[0].status);
+#ifdef VERBOSE
for (int i = 0; i < cnt; i++) {
printf("fwdAlgo(%d) time(%.2lfms) space(%zuMB)\n", perfResults[i].algo,
perfResults[i].time, perfResults[i].memory / 1024 / 1024);
}
+#endif
conv->fwdAlgo = perfResults[0].algo;

checkCUDA(cudaDeviceSynchronize());
@@ -177,8 +179,10 @@ void Model::measure_conv2d_cost(Conv2D* conv)
float milliseconds;
cudaEventElapsedTime(&milliseconds, startEvent, endEvent);
conv->runtime = milliseconds / REPEAT_TIMES;
+#ifdef VERBOSE
printf("measure[Conv2D]: i(%d %d %d %d) o(%d) k(%d %d) s(%d %d) p(%d %d) cost(%.4lf)\n",
BATCH_SIZE, inputC, inputH, inputW, outputC, conv->kernelH, conv->kernelW,
conv->strideH, conv->strideW, conv->padH, conv->padW, conv->runtime);
+#endif
}
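Note: every measure_* function touched by this commit shares the timing idiom visible above: replay the operator REPEAT_TIMES times between two CUDA events, then divide the elapsed milliseconds. A stripped-down sketch with a placeholder kernel and constant (error checking omitted; the repo wraps calls in checkCUDA/checkCUDNN):

#include <cuda_runtime.h>

#define REPEAT_TIMES 16  // illustrative; the repo defines its own value

__global__ void dummy_kernel(float* x) { x[threadIdx.x] *= 2.0f; }

float measure_runtime_ms(float* devBuf) {
  cudaEvent_t startEvent, endEvent;
  cudaEventCreate(&startEvent);
  cudaEventCreate(&endEvent);
  cudaEventRecord(startEvent);         // marker enqueued on the stream
  for (int i = 0; i < REPEAT_TIMES; i++)
    dummy_kernel<<<1, 32>>>(devBuf);
  cudaEventRecord(endEvent);
  cudaEventSynchronize(endEvent);      // block until all repeats finish
  float milliseconds;
  cudaEventElapsedTime(&milliseconds, startEvent, endEvent);
  cudaEventDestroy(startEvent);
  cudaEventDestroy(endEvent);
  return milliseconds / REPEAT_TIMES;  // average per-invocation cost
}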

4 changes: 4 additions & 0 deletions code/element.cc
@@ -33,7 +33,9 @@ Tensor Graph::add(Tensor t1, Tensor t2)
inEdges[op].insert(in);
outEdges[t2.op].insert(out);
}
+#ifdef VERBOSE
printf("inEdges[guid = %zu ptr = %p]\n", op.guid, op.ptr);
+#endif
Tensor t = op.ptr->outputs[0];
t.op = op;
return t;
@@ -57,7 +59,9 @@ Tensor Graph::mul(Tensor t1, Tensor t2)
inEdges[op].insert(in);
outEdges[t2.op].insert(out);
}
+#ifdef VERBOSE
printf("inEdges[guid = %zu ptr = %p]\n", op.guid, op.ptr);
+#endif
Tensor t = op.ptr->outputs[0];
t.op = op;
return t;
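Note: the traces being silenced here follow the graph's two adjacency maps: inEdges[op] holds the edges feeding an operator, and outEdges[producer] mirrors each edge from the producing side, so both directions can be walked. A reconstruction of the shapes implied by this diff (field layouts are guesses from usage, not the repo's definitions):

#include <cstddef>
#include <map>
#include <set>

struct OpBase;  // opaque operator implementation
struct Op {
  size_t guid;  // globally unique id, as printed in the traces above
  OpBase* ptr;
};
struct Edge {
  int idx;      // which output of srcOp feeds this edge
  Op srcOp;
  Edge(int i, Op op) : idx(i), srcOp(op) {}
};
struct OpCompare {
  bool operator()(const Op& a, const Op& b) const { return a.guid < b.guid; }
};
struct EdgeCompare {
  bool operator()(const Edge& a, const Edge& b) const {
    if (a.srcOp.guid != b.srcOp.guid) return a.srcOp.guid < b.srcOp.guid;
    return a.idx < b.idx;
  }
};

// The two maps as used by Graph::add, Graph::mul, and print_costs:
std::map<Op, std::set<Edge, EdgeCompare>, OpCompare> inEdges, outEdges;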
2 changes: 2 additions & 0 deletions code/element_cudnn.cu
@@ -100,8 +100,10 @@ void Model::measure_element_cost(Element* ele)
float milliseconds;
cudaEventElapsedTime(&milliseconds, startEvent, endEvent);
ele->runtime = milliseconds / REPEAT_TIMES;
+#ifdef VERBOSE
printf("measure[Element]: i(%d %d %d %d) type(%d) cost(%.4lf)\n",
ele->inputs[0].dim[0], ele->inputs[0].dim[1], ele->inputs[0].dim[2],
ele->inputs[0].dim[3], ele->type, ele->runtime);
+#endif
}

5 changes: 3 additions & 2 deletions code/graph_to_trt.cc
@@ -343,15 +343,16 @@ void runGraphTRT(Graph *graph) {
}

total /= numberRun;
std::cout << "Average over " << numberRun << " runs is " << total << " ms." << std::endl;
std::cout << "Optimized Graph on TensorRT:" << std::endl;
std::cout << " Average over " << numberRun << " runs is " << total << " ms." << std::endl;

for (int bindingIdx = 0; bindingIdx < nbBindings; ++bindingIdx) {
checkCUDA(cudaFree(buffers[bindingIdx]));
}

context->destroy();
engine->destroy();
-gProfiler.printLayerTimes();
+//gProfiler.printLayerTimes();
}

#endif
1 change: 1 addition & 0 deletions code/inception.h
@@ -111,6 +111,7 @@ Tensor inceptionE(Graph* graph, Tensor input)

Graph* InceptionV3(Model* model)
{
printf("Create InceptionV3 graph.\n");
Graph *graph = new Graph(model);
Tensor input;
input.numDim = 4;
2 changes: 2 additions & 0 deletions code/matmul_cudnn.cu
@@ -126,8 +126,10 @@ void Model::measure_matmul_cost(Matmul* mm)
float milliseconds;
cudaEventElapsedTime(&milliseconds, startEvent, endEvent);
mm->runtime = milliseconds / REPEAT_TIMES;
+#ifdef VERBOSE
printf("measure[Matmul]: i(%d %d %d) o(%d) acti(%d) cost(%.4lf)\n",
mm->inputs[0].dim[0], mm->inputs[0].dim[1], inputC, outputC,
mm->actiMode, mm->runtime);
+#endif
}

12 changes: 8 additions & 4 deletions code/ops.cc
@@ -474,9 +474,13 @@ void Graph::print_costs(void)
std::map<Op, std::set<Edge, EdgeCompare>, OpCompare>::const_iterator it;
for (it = inEdges.begin(); it != inEdges.end(); it++)
it->first.ptr->collect_costs(exe_time, flops, mem_acc, num_kernels);
printf("Cost metrics: exe_time(%.4lf) flops(%.4lf) "
"memory_access(%.4lf) kernel_launches(%d)\n",
exe_time, flops / 1024.0 / 1024.0 / 1024.0,
mem_acc * 4.0 / 1024.0 / 1024.0, num_kernels);
printf(" Estimated runtime = %.4lf ms\n", exe_time);
printf(" Floating point operations = %.4lf Gflop\n", flops / 1024 / 1024 / 1024);
printf(" Memory accesses = %.4lf MB\n", mem_acc * 4.0 / 1024 / 1024);
printf(" GPU kernel launches = %d\n", num_kernels);
//printf("Cost metrics: exe_time(%.4lf) flops(%.4lf) "
// "memory_access(%.4lf) kernel_launches(%d)\n",
// exe_time, flops / 1024.0 / 1024.0 / 1024.0,
// mem_acc * 4.0 / 1024.0 / 1024.0, num_kernels);
}
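Note: the conversions behind the new report assume 4-byte single-precision values, which is what the * 4.0 factor implies: flops / 1024^3 turns a raw operation count into Gflop, and mem_acc * 4 / 1024^2 turns an element count into MB (for example, 2,621,440 elements * 4 bytes = 10,485,760 bytes = exactly 10 MB). The integer literals in flops / 1024 / 1024 / 1024 still divide in floating point provided flops is a floating-point variable, as its use with %.4lf suggests; if it were an integer type the format specifier would be wrong.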

2 changes: 1 addition & 1 deletion code/ops_cudnn.cu
@@ -25,7 +25,7 @@ Model::Model(bool training)
workSpaceSize = WORK_SPACE_SIZE;
global_unique_id = 100;
checkCUDA(cudaMalloc(&workSpace, workSpaceSize));
printf("handle.workSpace = 0x%p\n", workSpace);
//printf("handle.workSpace = 0x%p\n", workSpace);
// create all descriptors
checkCUDNN(cudnnCreateTensorDescriptor(&inputTensor));
checkCUDNN(cudnnCreateTensorDescriptor(&biasTensor));
2 changes: 2 additions & 0 deletions code/pool2d.cc
@@ -91,8 +91,10 @@ Pool2D::Pool2D(Model* _model, Tensor _input, OpType _type,
int inputW = _input.dim[3];
int outputH = 1 + (inputH + 2 * padH - kernelH) / strideH;
int outputW = 1 + (inputW + 2 * padW - kernelW) / strideW;
+#ifdef VERBOSE
printf("k(%d %d) pad(%d %d) s(%d %d)\n",
kernelH, kernelW, padH, padW, strideH, strideW);
+#endif
numOutputs = 1;
outputs[0].numDim = 4;
outputs[0].dim[0] = BATCH_SIZE;
2 changes: 2 additions & 0 deletions code/pool2d_cudnn.cu
@@ -131,8 +131,10 @@ void Model::measure_pool2d_cost(Pool2D* pool)
float milliseconds;
cudaEventElapsedTime(&milliseconds, startEvent, endEvent);
pool->runtime = milliseconds / REPEAT_TIMES;
+#ifdef VERBOSE
printf("measure[Pool2D]: i(%d %d %d %d) k(%d %d) s(%d %d) p(%d %d) cost(%.4lf)\n",
BATCH_SIZE, inputC, inputH, inputW, pool->kernelH, pool->kernelW,
pool->strideH, pool->strideW, pool->padH, pool->padW, pool->runtime);
+#endif
}

1 change: 1 addition & 0 deletions code/resnet.h
@@ -56,6 +56,7 @@ Tensor BottleneckBlock(Graph* graph, Tensor input, int outChannels,

Graph* ResNet34(Model* model)
{
printf("Create ResNet-34 graph.\n");
Graph *graph = new Graph(model);
Tensor input;
input.numDim = 4;
2 changes: 2 additions & 0 deletions code/split.cc
@@ -130,7 +130,9 @@ void Model::measure_split_cost(Split* split)
{
// We assume split cost is zero
split->runtime = 0;
+#ifdef VERBOSE
printf("measure[split]: cost(%.4lf)\n", split->runtime);
+#endif
}

// key is (inputN, input H, inputW, n, outputC[0...,n-1]
1 change: 1 addition & 0 deletions code/squeezenet.h
@@ -38,6 +38,7 @@ Tensor fire_complex(Graph* graph, Tensor input, int squeeze, int expand)

Graph* SqueezeNetComplex(Model* model)
{
printf("Create SqueezeNet graph with complex bypass.\n");
Graph *graph = new Graph(model);
Tensor input;
input.numDim = 4;
69 changes: 1 addition & 68 deletions code/sru.h
@@ -50,6 +50,7 @@ SRUTensors SRUOpt(Graph* graph, Tensor x, Tensor c)

Graph* RNNTC_SRU(Model* model)
{
printf("Create RNN Text Classification Graph.\n");
const int LENGTH = 20;
const int NUM_LAYERS = 1;
Graph *graph = new Graph(model);
@@ -106,72 +107,4 @@ Graph* RNNTC_OPT(Model* model)
return graph;
}

-Graph* NMT_SRU(Model* model)
-{
-const int LENGTH = 40;
-const int NUM_LAYERS = 2;
-Graph *graph = new Graph(model);
-Tensor input;
-input.numDim = 3;
-input.dim[0] = 1;
-input.dim[1] = BATCH_SIZE;
-input.dim[2] = EMBED_SIZE;
-input.dim[3] = 0;
-input.op.guid = 0;
-input.op.ptr = NULL;
-input.idx = 0;
-Tensor xs[LENGTH];
-for (int i = 0; i < LENGTH; i++) {
-xs[i] = graph->noop(input);
-}
-Tensor c = graph->noop(input);
-SRUTensors sru[NUM_LAYERS][LENGTH];
-for (int i = 0; i < LENGTH; i++) {
-for (int j = 0; j < NUM_LAYERS; j++) {
-Tensor x_in;
-if (i < LENGTH / 2)
-x_in = (j==0) ? xs[i] : sru[j-1][i].h;
-else
-x_in = (j==0) ? sru[NUM_LAYERS-1][i-1].h : sru[j-1][i].h;
-Tensor c_in = (i==0) ? c : sru[j][i-1].c;
-sru[j][i] = SRUNode(graph, x_in, c_in);
-}
-}
-return graph;
-}
-
-Graph* NMT_OPT(Model* model)
-{
-const int LENGTH = 40;
-const int NUM_LAYERS = 2;
-Graph *graph = new Graph(model);
-Tensor input;
-input.numDim = 3;
-input.dim[0] = 1;
-input.dim[1] = BATCH_SIZE;
-input.dim[2] = EMBED_SIZE;
-input.dim[3] = 0;
-input.op.guid = 0;
-input.op.ptr = NULL;
-input.idx = 0;
-Tensor xs[LENGTH];
-for (int i = 0; i < LENGTH; i++) {
-xs[i] = graph->noop(input);
-}
-Tensor c = graph->noop(input);
-SRUTensors sru[NUM_LAYERS][LENGTH];
-for (int i = 0; i < LENGTH; i++) {
-for (int j = 0; j < NUM_LAYERS; j++) {
-Tensor x_in;
-if (i < LENGTH / 2)
-x_in = (j==0) ? xs[i] : sru[j-1][i].h;
-else
-x_in = (j==0) ? sru[NUM_LAYERS-1][i-1].h : sru[j-1][i].h;
-Tensor c_in = (i==0) ? c : sru[j][i-1].c;
-sru[j][i] = SRUOpt(graph, x_in, c_in);
-}
-}
-return graph;
-}
-
#endif
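Note: NMT_SRU and NMT_OPT, deleted above, were the graph builders behind the NMT branches removed from cnn.cc earlier in this commit; RNNTC_SRU and RNNTC_OPT remain as the only SRU-based benchmarks.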
6 changes: 6 additions & 0 deletions install.sh
@@ -0,0 +1,6 @@
+#!/bin/bash
+
+cd code
+make clean
+make gpu
+make trt
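Note: run the script from the repository root; with the ../-prefixed BIN_ paths in the Makefile above, the mf and mf_trt binaries land next to it. It assumes the GPU and TensorRT prerequisites referenced by the Makefile (cuDNN, cuBLAS, and TRTPATH) are already set up, and it does not build the MKL/CPU target.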
