Commit d1ddcc5

simplify outputs
jiazhihao committed Jan 26, 2019
1 parent f84d103 commit d1ddcc5
Showing 20 changed files with 63 additions and 91 deletions.
6 changes: 3 additions & 3 deletions code/Makefile
@@ -12,9 +12,9 @@ LDFLAGS_GPU = -lcudnn -lcublas
LDFLAGS_CPU = -L$(MKLPATH)/lib -lmklml_intel -liomp5
LDFLAGS_TRT = $(LDFLAGS_GPU) -L$(TRTPATH)/lib -lnvinfer

-BIN_GPU = mf
-BIN_CPU = mf_mkl
-BIN_TRT = mf_trt
+BIN_GPU = ../mf
+BIN_CPU = ../mf_mkl
+BIN_TRT = ../mf_trt

gpu: $(BIN_GPU)
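Note: with the ../ prefix, make gpu now drops the mf binary in the repository root (one level above code/) instead of next to the sources; mf_mkl and mf_trt move the same way, which matches the new install.sh added at the bottom of this commit (it builds from inside code/).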

2 changes: 2 additions & 0 deletions code/activation_cudnn.cu
@@ -127,8 +127,10 @@ void Model::measure_activation_cost(Activation* act)
float milliseconds;
cudaEventElapsedTime(&milliseconds, startEvent, endEvent);
act->runtime = milliseconds / REPEAT_TIMES;
+#ifdef VERBOSE
printf("measure[Activation]: i(%d %d %d %d) type(%d) cost(%.4lf)\n",
act->inputs[0].dim[0], act->inputs[0].dim[1], act->inputs[0].dim[2],
act->inputs[0].dim[3], act->type, act->runtime);
+#endif
}
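Note: the guard above is the pattern this commit applies across almost every file: per-operator measurement output is compiled in only when VERBOSE is defined. A minimal self-contained sketch of the idiom (names are illustrative, not from the repo):

#include <cstdio>

// Compile with -DVERBOSE to keep the diagnostic; without it the
// preprocessor drops the printf entirely, so normal builds stay quiet
// and pay no runtime cost.
float report_cost(float milliseconds, int repeat_times) {
  float runtime = milliseconds / repeat_times;
#ifdef VERBOSE
  printf("measure[Something]: cost(%.4lf)\n", runtime);
#endif
  return runtime;
}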

25 changes: 12 additions & 13 deletions code/cnn.cc
@@ -54,8 +54,8 @@ Graph* optimize_graph(Graph *graph, Model *model, float alpha, int budget)
hashmap.insert(graph->hash());
Graph *bestGraph = graph;
float bestCost = graph->total_cost();
printf("baselineCost = %.4lfms\n", bestCost);
printf("baselineGraph: end-to-end runtime = %.4lfms\n", graph->run(model));
//printf("baselineCost = %.4lfms\n", bestCost);
printf("Baseline Graph:\n End-to-end runtime = %.4lfms\n", graph->run(model));
graph->print_costs();

int counter = 0;
@@ -67,21 +67,26 @@ Graph* optimize_graph(Graph *graph, Model *model, float alpha, int budget)
bestCost = subGraph->total_cost();
bestGraph = subGraph;
}
if (subGraph->total_cost() > alpha * bestCost) {
delete subGraph;
continue;
}
if (counter > budget) {
+// TODO: free all remaining candidates when budget exhausted
break;
}
+#ifdef VERBOSE
if (counter % 100 == 0)
printf("[%d] cost = %.4lf bestCost = %.4lf candidates.size() = %zu\n", counter, subGraph->total_cost(), bestCost, candidates.size());
+#endif
counter ++;
for (int i = 0; i < xfers.size(); i++)
xfers[i]->run(0, subGraph, candidates, hashmap, bestCost * alpha);
+if (bestGraph != subGraph) {
+delete subGraph;
+}
}
printf("bestCost = %.4lf\n", bestCost);
printf("bestGraph: end-to-end runtime = %.2lf\n", bestGraph->run(model));
printf("Optimized Graph:\n End-to-end runtime = %.4lfms\n", bestGraph->run(model));
bestGraph->print_costs();

return bestGraph;
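Note: this hunk is the heart of the optimizer, a relaxed best-first search: the cheapest candidate is dequeued first, anything costing more than alpha times the best graph found so far is pruned (the relaxation is what lets cost-increasing substitutions survive long enough to pay off), and budget caps how many candidates are explored. A compilable miniature of that control flow with toy stand-ins; this is a sketch of the visible logic, not the repo's classes:

#include <cstdio>
#include <queue>
#include <set>
#include <vector>

// Toy stand-ins; the real Graph and GraphXfer classes are only
// partially visible in this diff.
struct Graph {
  float cost;
  float total_cost() const { return cost; }
  size_t hash() const { return (size_t)(cost * 1e6f); }
};
struct GraphCompare {  // min-heap: cheapest candidate on top (assumed)
  bool operator()(Graph* a, Graph* b) const {
    return a->total_cost() > b->total_cost();
  }
};
typedef std::priority_queue<Graph*, std::vector<Graph*>, GraphCompare> Queue;

// Stand-in for GraphXfer::run: enqueue one rewritten copy unless it is
// a duplicate or already exceeds the pruning threshold.
void apply_xfers(Graph* g, Queue& candidates, std::set<size_t>& hashmap,
                 float threshold) {
  Graph* next = new Graph{g->cost * 0.98f};  // pretend the rewrite helps
  if (next->total_cost() > threshold || !hashmap.insert(next->hash()).second)
    delete next;
  else
    candidates.push(next);
}

Graph* search(Graph* start, float alpha, int budget) {
  Queue candidates;
  std::set<size_t> hashmap;
  candidates.push(start);
  hashmap.insert(start->hash());
  Graph* bestGraph = start;
  float bestCost = start->total_cost();
  int counter = 0;
  while (!candidates.empty()) {
    Graph* subGraph = candidates.top();
    candidates.pop();
    if (subGraph->total_cost() < bestCost) {
      bestCost = subGraph->total_cost();
      bestGraph = subGraph;
    }
    // Relaxed pruning: keep anything within alpha of the best so far.
    if (subGraph->total_cost() > alpha * bestCost) {
      delete subGraph;
      continue;
    }
    if (counter++ > budget)
      break;  // queued candidates leak here, the TODO in this hunk
    apply_xfers(subGraph, candidates, hashmap, bestCost * alpha);
    if (bestGraph != subGraph) delete subGraph;
  }
  return bestGraph;
}

int main() {
  Graph start{10.0f};
  printf("bestCost = %.4f\n", search(&start, 1.01f, 300)->total_cost());
  return 0;
}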
@@ -106,6 +111,7 @@ DNNModel name_to_model(std::string name)
if (name == "resnet50") return Resnet50;
if (name == "densenet") return DenseNet;
if (name == "rnntc") return RNNTC;
+assert(false);
}
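Note: the added assert does more than tidy output: name_to_model returns from every recognized branch, and without a terminating assert an unknown name would fall off the end of a non-void function, which is undefined behavior in C++. A table-driven equivalent (hypothetical sketch; enumerators inferred from the model headers in this commit):

#include <cassert>
#include <string>
#include <unordered_map>

enum DNNModel { None, SqueezeNet, InceptionV3, Resnet34, Resnet50, DenseNet, RNNTC };

DNNModel name_to_model(const std::string& name) {
  static const std::unordered_map<std::string, DNNModel> table = {
      {"squeezenet", SqueezeNet}, {"inception", InceptionV3},
      {"resnet34", Resnet34},     {"resnet50", Resnet50},
      {"densenet", DenseNet},     {"rnntc", RNNTC}};
  auto it = table.find(name);
  assert(it != table.end() && "unknown model name");  // mirrors assert(false)
  return it->second;
}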

void parse_args(bool &optimize,
@@ -181,8 +187,8 @@ int main(int argc, char **argv)
{
bool optimize = true;
bool export_graph = false;
-int budget = 1024 * 2; // 2K candidates
-float alpha = 1.05;
+int budget = 300; // 300 candidates
+float alpha = 1.01;
DNNModel dnn = None;
std::string export_file_name;
parse_args(optimize, export_graph, alpha, budget, export_file_name, dnn, argc, argv);
@@ -210,9 +216,6 @@ int main(int argc, char **argv)
case RNNTC:
graph = RNNTC_SRU(model);
break;
-case NMT:
-graph = NMT_SRU(model);
-break;
default:
assert(false);
}
@@ -226,10 +229,6 @@ int main(int argc, char **argv)
graph = RNNTC_OPT(model);
printf("bestGraph: end-to-end runtime = %.2lf\n", graph->run(model));
graph->print_costs();
} else if (optimize && dnn == NMT) {
graph = NMT_OPT(model);
printf("bestGraph: end-to-end runtime = %.2lf\n", graph->run(model));
graph->print_costs();
} else if (optimize) {
graph = optimize_graph(graph, model, alpha, budget);
}
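Note: the new defaults trade search breadth for time. alpha = 1.01 prunes any candidate whose cost exceeds the current best by more than 1% (once a 10.0 ms graph is known, a 10.2 ms candidate is dropped, since 10.2 > 1.01 * 10.0 = 10.1), and budget = 300 stops the search after 300 dequeued candidates instead of 2048.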
2 changes: 2 additions & 0 deletions code/concat_cudnn.cu
@@ -64,6 +64,8 @@ void Model::measure_concat_cost(Concat* concat)
float milliseconds;
cudaEventElapsedTime(&milliseconds, startEvent, endEvent);
concat->runtime = milliseconds / REPEAT_TIMES;
+#ifdef VERBOSE
printf("measure[Concat]: cost(%.4lf)\n", concat->runtime);
+#endif
}

4 changes: 4 additions & 0 deletions code/conv2d.cc
@@ -26,7 +26,9 @@ Tensor Graph::conv2d(Tensor _input, int _outputC,
inEdges[op];
outEdges[op];
Edge in(_input.idx, _input.op), out(_input.idx, op);
+#ifdef VERBOSE
printf("inEdges[guid = %zu ptr = %p]\n", op.guid, op.ptr);
+#endif
inEdges[op].insert(in);
outEdges[_input.op].insert(out);
Tensor t = op.ptr->outputs[0];
@@ -68,8 +70,10 @@ Conv2D::Conv2D(Model* _model, Tensor _input, int _outputC,
padH(_padH), padW(_padW), relu(_relu)
{
assert(_input.numDim == 4);
+#ifdef VERBOSE
printf("k(%d %d) pad(%d %d) stride(%d %d)\n",
kernelH, kernelW, padH, padW, strideH, strideW);
+#endif
int inputH = _input.dim[2];
int inputW = _input.dim[3];
int outputH = 1 + (inputH + 2 * padH - kernelH) / strideH;
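Note: the output height uses the standard convolution size formula with integer (flooring) division; outputW presumably follows the same form just below this cut. A compile-time check with illustrative numbers:

// outputH = 1 + (inputH + 2*padH - kernelH) / strideH
constexpr int out_size(int in, int pad, int kernel, int stride) {
  return 1 + (in + 2 * pad - kernel) / stride;  // integer division floors
}
static_assert(out_size(56, 1, 3, 1) == 56, "3x3, pad 1, stride 1 keeps 56");
static_assert(out_size(56, 1, 3, 2) == 28, "stride 2: 1 + 55/2 = 28");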
4 changes: 4 additions & 0 deletions code/conv2d_cudnn.cu
@@ -148,10 +148,12 @@ void Model::measure_conv2d_cost(Conv2D* conv)
workSpace, workSpaceSize));
assert(cnt > 0);
checkCUDNN(perfResults[0].status);
+#ifdef VERBOSE
for (int i = 0; i < cnt; i++) {
printf("fwdAlgo(%d) time(%.2lfms) space(%zuMB)\n", perfResults[i].algo,
perfResults[i].time, perfResults[i].memory / 1024 / 1024);
}
+#endif
conv->fwdAlgo = perfResults[0].algo;

checkCUDA(cudaDeviceSynchronize());
@@ -177,8 +179,10 @@ void Model::measure_conv2d_cost(Conv2D* conv)
float milliseconds;
cudaEventElapsedTime(&milliseconds, startEvent, endEvent);
conv->runtime = milliseconds / REPEAT_TIMES;
+#ifdef VERBOSE
printf("measure[Conv2D]: i(%d %d %d %d) o(%d) k(%d %d) s(%d %d) p(%d %d) cost(%.4lf)\n",
BATCH_SIZE, inputC, inputH, inputW, outputC, conv->kernelH, conv->kernelW,
conv->strideH, conv->strideW, conv->padH, conv->padW, conv->runtime);
+#endif
}
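Note: every measure_* function touched by this commit shares the timing idiom visible above: replay the operator REPEAT_TIMES times between two CUDA events, then divide the elapsed milliseconds. A stripped-down sketch with a placeholder kernel and constant (error checking omitted; the repo wraps calls in checkCUDA/checkCUDNN):

#include <cuda_runtime.h>

#define REPEAT_TIMES 16  // illustrative; the repo defines its own value

__global__ void dummy_kernel(float* x) { x[threadIdx.x] *= 2.0f; }

float measure_runtime_ms(float* devBuf) {
  cudaEvent_t startEvent, endEvent;
  cudaEventCreate(&startEvent);
  cudaEventCreate(&endEvent);
  cudaEventRecord(startEvent);         // marker enqueued on the stream
  for (int i = 0; i < REPEAT_TIMES; i++)
    dummy_kernel<<<1, 32>>>(devBuf);
  cudaEventRecord(endEvent);
  cudaEventSynchronize(endEvent);      // block until all repeats finish
  float milliseconds;
  cudaEventElapsedTime(&milliseconds, startEvent, endEvent);
  cudaEventDestroy(startEvent);
  cudaEventDestroy(endEvent);
  return milliseconds / REPEAT_TIMES;  // average per-invocation cost
}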

4 changes: 4 additions & 0 deletions code/element.cc
@@ -33,7 +33,9 @@ Tensor Graph::add(Tensor t1, Tensor t2)
inEdges[op].insert(in);
outEdges[t2.op].insert(out);
}
+#ifdef VERBOSE
printf("inEdges[guid = %zu ptr = %p]\n", op.guid, op.ptr);
+#endif
Tensor t = op.ptr->outputs[0];
t.op = op;
return t;
@@ -57,7 +59,9 @@ Tensor Graph::mul(Tensor t1, Tensor t2)
inEdges[op].insert(in);
outEdges[t2.op].insert(out);
}
+#ifdef VERBOSE
printf("inEdges[guid = %zu ptr = %p]\n", op.guid, op.ptr);
+#endif
Tensor t = op.ptr->outputs[0];
t.op = op;
return t;
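Note: the traces being silenced here follow the graph's two adjacency maps: inEdges[op] holds the edges feeding an operator, and outEdges[producer] mirrors each edge from the producing side, so both directions can be walked. A reconstruction of the shapes implied by this diff (field layouts are guesses from usage, not the repo's definitions):

#include <cstddef>
#include <map>
#include <set>

struct OpBase;  // opaque operator implementation
struct Op {
  size_t guid;  // globally unique id, as printed in the traces above
  OpBase* ptr;
};
struct Edge {
  int idx;      // which output of srcOp feeds this edge
  Op srcOp;
  Edge(int i, Op op) : idx(i), srcOp(op) {}
};
struct OpCompare {
  bool operator()(const Op& a, const Op& b) const { return a.guid < b.guid; }
};
struct EdgeCompare {
  bool operator()(const Edge& a, const Edge& b) const {
    if (a.srcOp.guid != b.srcOp.guid) return a.srcOp.guid < b.srcOp.guid;
    return a.idx < b.idx;
  }
};

// The two maps as used by Graph::add, Graph::mul, and print_costs:
std::map<Op, std::set<Edge, EdgeCompare>, OpCompare> inEdges, outEdges;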
2 changes: 2 additions & 0 deletions code/element_cudnn.cu
@@ -100,8 +100,10 @@ void Model::measure_element_cost(Element* ele)
float milliseconds;
cudaEventElapsedTime(&milliseconds, startEvent, endEvent);
ele->runtime = milliseconds / REPEAT_TIMES;
+#ifdef VERBOSE
printf("measure[Element]: i(%d %d %d %d) type(%d) cost(%.4lf)\n",
ele->inputs[0].dim[0], ele->inputs[0].dim[1], ele->inputs[0].dim[2],
ele->inputs[0].dim[3], ele->type, ele->runtime);
+#endif
}

5 changes: 3 additions & 2 deletions code/graph_to_trt.cc
@@ -343,15 +343,16 @@ void runGraphTRT(Graph *graph) {
}

total /= numberRun;
std::cout << "Average over " << numberRun << " runs is " << total << " ms." << std::endl;
std::cout << "Optimized Graph on TensorRT:" << std::endl;
std::cout << " Average over " << numberRun << " runs is " << total << " ms." << std::endl;

for (int bindingIdx = 0; bindingIdx < nbBindings; ++bindingIdx) {
checkCUDA(cudaFree(buffers[bindingIdx]));
}

context->destroy();
engine->destroy();
-gProfiler.printLayerTimes();
+//gProfiler.printLayerTimes();
}

#endif
1 change: 1 addition & 0 deletions code/inception.h
@@ -111,6 +111,7 @@ Tensor inceptionE(Graph* graph, Tensor input)

Graph* InceptionV3(Model* model)
{
printf("Create InceptionV3 graph.\n");
Graph *graph = new Graph(model);
Tensor input;
input.numDim = 4;
2 changes: 2 additions & 0 deletions code/matmul_cudnn.cu
@@ -126,8 +126,10 @@ void Model::measure_matmul_cost(Matmul* mm)
float milliseconds;
cudaEventElapsedTime(&milliseconds, startEvent, endEvent);
mm->runtime = milliseconds / REPEAT_TIMES;
+#ifdef VERBOSE
printf("measure[Matmul]: i(%d %d %d) o(%d) acti(%d) cost(%.4lf)\n",
mm->inputs[0].dim[0], mm->inputs[0].dim[1], inputC, outputC,
mm->actiMode, mm->runtime);
+#endif
}

12 changes: 8 additions & 4 deletions code/ops.cc
@@ -474,9 +474,13 @@ void Graph::print_costs(void)
std::map<Op, std::set<Edge, EdgeCompare>, OpCompare>::const_iterator it;
for (it = inEdges.begin(); it != inEdges.end(); it++)
it->first.ptr->collect_costs(exe_time, flops, mem_acc, num_kernels);
printf("Cost metrics: exe_time(%.4lf) flops(%.4lf) "
"memory_access(%.4lf) kernel_launches(%d)\n",
exe_time, flops / 1024.0 / 1024.0 / 1024.0,
mem_acc * 4.0 / 1024.0 / 1024.0, num_kernels);
printf(" Estimated runtime = %.4lf ms\n", exe_time);
printf(" Floating point operations = %.4lf Gflop\n", flops / 1024 / 1024 / 1024);
printf(" Memory accesses = %.4lf MB\n", mem_acc * 4.0 / 1024 / 1024);
printf(" GPU kernel launches = %d\n", num_kernels);
//printf("Cost metrics: exe_time(%.4lf) flops(%.4lf) "
// "memory_access(%.4lf) kernel_launches(%d)\n",
// exe_time, flops / 1024.0 / 1024.0 / 1024.0,
// mem_acc * 4.0 / 1024.0 / 1024.0, num_kernels);
}
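Note: the conversions behind the new report assume 4-byte single-precision values, which is what the * 4.0 factor implies: flops / 1024^3 turns a raw operation count into Gflop, and mem_acc * 4 / 1024^2 turns an element count into MB (for example, 2,621,440 elements * 4 bytes = 10,485,760 bytes = exactly 10 MB). The integer literals in flops / 1024 / 1024 / 1024 still divide in floating point provided flops is a floating-point variable, as its use with %.4lf suggests; if it were an integer type the format specifier would be wrong.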

2 changes: 1 addition & 1 deletion code/ops_cudnn.cu
@@ -25,7 +25,7 @@ Model::Model(bool training)
workSpaceSize = WORK_SPACE_SIZE;
global_unique_id = 100;
checkCUDA(cudaMalloc(&workSpace, workSpaceSize));
printf("handle.workSpace = 0x%p\n", workSpace);
//printf("handle.workSpace = 0x%p\n", workSpace);
// create all descriptors
checkCUDNN(cudnnCreateTensorDescriptor(&inputTensor));
checkCUDNN(cudnnCreateTensorDescriptor(&biasTensor));
2 changes: 2 additions & 0 deletions code/pool2d.cc
@@ -91,8 +91,10 @@ Pool2D::Pool2D(Model* _model, Tensor _input, OpType _type,
int inputW = _input.dim[3];
int outputH = 1 + (inputH + 2 * padH - kernelH) / strideH;
int outputW = 1 + (inputW + 2 * padW - kernelW) / strideW;
+#ifdef VERBOSE
printf("k(%d %d) pad(%d %d) s(%d %d)\n",
kernelH, kernelW, padH, padW, strideH, strideW);
+#endif
numOutputs = 1;
outputs[0].numDim = 4;
outputs[0].dim[0] = BATCH_SIZE;
2 changes: 2 additions & 0 deletions code/pool2d_cudnn.cu
@@ -131,8 +131,10 @@ void Model::measure_pool2d_cost(Pool2D* pool)
float milliseconds;
cudaEventElapsedTime(&milliseconds, startEvent, endEvent);
pool->runtime = milliseconds / REPEAT_TIMES;
+#ifdef VERBOSE
printf("measure[Pool2D]: i(%d %d %d %d) k(%d %d) s(%d %d) p(%d %d) cost(%.4lf)\n",
BATCH_SIZE, inputC, inputH, inputW, pool->kernelH, pool->kernelW,
pool->strideH, pool->strideW, pool->padH, pool->padW, pool->runtime);
+#endif
}

1 change: 1 addition & 0 deletions code/resnet.h
@@ -56,6 +56,7 @@ Tensor BottleneckBlock(Graph* graph, Tensor input, int outChannels,

Graph* ResNet34(Model* model)
{
printf("Create ResNet-34 graph.\n");
Graph *graph = new Graph(model);
Tensor input;
input.numDim = 4;
2 changes: 2 additions & 0 deletions code/split.cc
@@ -130,7 +130,9 @@ void Model::measure_split_cost(Split* split)
{
// We assume split cost is zero
split->runtime = 0;
+#ifdef VERBOSE
printf("measure[split]: cost(%.4lf)\n", split->runtime);
+#endif
}

// key is (inputN, input H, inputW, n, outputC[0...,n-1]
1 change: 1 addition & 0 deletions code/squeezenet.h
@@ -38,6 +38,7 @@ Tensor fire_complex(Graph* graph, Tensor input, int squeeze, int expand)

Graph* SqueezeNetComplex(Model* model)
{
printf("Create SqueezeNet graph with complex bypass.\n");
Graph *graph = new Graph(model);
Tensor input;
input.numDim = 4;
69 changes: 1 addition & 68 deletions code/sru.h
@@ -50,6 +50,7 @@ SRUTensors SRUOpt(Graph* graph, Tensor x, Tensor c)

Graph* RNNTC_SRU(Model* model)
{
printf("Create RNN Text Classification Graph.\n");
const int LENGTH = 20;
const int NUM_LAYERS = 1;
Graph *graph = new Graph(model);
@@ -106,72 +107,4 @@ Graph* RNNTC_OPT(Model* model)
return graph;
}

-Graph* NMT_SRU(Model* model)
-{
-const int LENGTH = 40;
-const int NUM_LAYERS = 2;
-Graph *graph = new Graph(model);
-Tensor input;
-input.numDim = 3;
-input.dim[0] = 1;
-input.dim[1] = BATCH_SIZE;
-input.dim[2] = EMBED_SIZE;
-input.dim[3] = 0;
-input.op.guid = 0;
-input.op.ptr = NULL;
-input.idx = 0;
-Tensor xs[LENGTH];
-for (int i = 0; i < LENGTH; i++) {
-xs[i] = graph->noop(input);
-}
-Tensor c = graph->noop(input);
-SRUTensors sru[NUM_LAYERS][LENGTH];
-for (int i = 0; i < LENGTH; i++) {
-for (int j = 0; j < NUM_LAYERS; j++) {
-Tensor x_in;
-if (i < LENGTH / 2)
-x_in = (j==0) ? xs[i] : sru[j-1][i].h;
-else
-x_in = (j==0) ? sru[NUM_LAYERS-1][i-1].h : sru[j-1][i].h;
-Tensor c_in = (i==0) ? c : sru[j][i-1].c;
-sru[j][i] = SRUNode(graph, x_in, c_in);
-}
-}
-return graph;
-}
-
-Graph* NMT_OPT(Model* model)
-{
-const int LENGTH = 40;
-const int NUM_LAYERS = 2;
-Graph *graph = new Graph(model);
-Tensor input;
-input.numDim = 3;
-input.dim[0] = 1;
-input.dim[1] = BATCH_SIZE;
-input.dim[2] = EMBED_SIZE;
-input.dim[3] = 0;
-input.op.guid = 0;
-input.op.ptr = NULL;
-input.idx = 0;
-Tensor xs[LENGTH];
-for (int i = 0; i < LENGTH; i++) {
-xs[i] = graph->noop(input);
-}
-Tensor c = graph->noop(input);
-SRUTensors sru[NUM_LAYERS][LENGTH];
-for (int i = 0; i < LENGTH; i++) {
-for (int j = 0; j < NUM_LAYERS; j++) {
-Tensor x_in;
-if (i < LENGTH / 2)
-x_in = (j==0) ? xs[i] : sru[j-1][i].h;
-else
-x_in = (j==0) ? sru[NUM_LAYERS-1][i-1].h : sru[j-1][i].h;
-Tensor c_in = (i==0) ? c : sru[j][i-1].c;
-sru[j][i] = SRUOpt(graph, x_in, c_in);
-}
-}
-return graph;
-}
-
#endif
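Note: NMT_SRU and NMT_OPT, deleted above, were the graph builders behind the NMT branches removed from cnn.cc earlier in this commit; RNNTC_SRU and RNNTC_OPT remain as the only SRU-based benchmarks.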
6 changes: 6 additions & 0 deletions install.sh
@@ -0,0 +1,6 @@
+#!/bin/bash
+
+cd code
+make clean
+make gpu
+make trt
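Note: run the script from the repository root; with the ../-prefixed BIN_ paths in the Makefile above, the mf and mf_trt binaries land next to it. It assumes the GPU and TensorRT prerequisites referenced by the Makefile (cuDNN, cuBLAS, and TRTPATH) are already set up, and it does not build the MKL/CPU target.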
