From 251634fd459a09736b9ed2ed07c807fad744d467 Mon Sep 17 00:00:00 2001
From: carwin
Date: Thu, 10 Nov 2016 12:19:46 +0800
Subject: [PATCH] add NNPACK support for high convolution inference perf
 (#3666)

* add NNPACK support for high convolution inference perf

* set USE_NNPACK to 0

* Fix header declaration

* Fix input_size init value

1. data's shape is BxCxHxW, input_size is {width,height}
2. improve algorithm selection policy

* Fix lint error
---
 Makefile                                     |   6 +
 make/config.mk                               |   4 +
 src/operator/convolution.cc                  |  15 +++
 src/operator/nnpack/nnpack_convolution-inl.h | 117 +++++++++++++++++++
 4 files changed, 142 insertions(+)
 create mode 100644 src/operator/nnpack/nnpack_convolution-inl.h

diff --git a/Makefile b/Makefile
index 422f1bd62ed4..3ef5661d1024 100644
--- a/Makefile
+++ b/Makefile
@@ -58,6 +58,12 @@ ifeq ($(USE_OPENMP), 1)
 	CFLAGS += -fopenmp
 endif
 
+ifeq ($(USE_NNPACK), 1)
+	CFLAGS += -DMXNET_USE_NNPACK=1
+	CFLAGS += -DMXNET_USE_NNPACK_NUM_THREADS=$(USE_NNPACK_NUM_THREADS)
+	LDFLAGS += -lnnpack
+endif
+
 ifeq ($(USE_MKL2017), 1)
 	CFLAGS += -DMXNET_USE_MKL2017=1
 	CFLAGS += -DUSE_MKL=1
diff --git a/make/config.mk b/make/config.mk
index 9b6c6755c6fc..44fa4d984878 100644
--- a/make/config.mk
+++ b/make/config.mk
@@ -65,6 +65,10 @@ USE_MKL2017 = 0
 # whether use MKL2017 experimental feature for high performance
 USE_MKL2017_EXPERIMENTAL = 0
 
+# whether use NNPACK library
+USE_NNPACK = 0
+USE_NNPACK_NUM_THREADS = 4
+
 # choose the version of blas you want to use
 # can be: mkl, blas, atlas, openblas
 # in default use atlas for linux while apple for osx
diff --git a/src/operator/convolution.cc b/src/operator/convolution.cc
index ee37a238c905..0faf2163d1a1 100644
--- a/src/operator/convolution.cc
+++ b/src/operator/convolution.cc
@@ -11,6 +11,9 @@
 #include "./mkl/mkl_memory-inl.h"
 #include "./mkl/mkl_convolution-inl.h"
 #endif  // MXNET_USE_MKL2017
+#if MXNET_USE_NNPACK == 1
+#include "./nnpack/nnpack_convolution-inl.h"
+#endif  // MXNET_USE_NNPACK
 
 namespace mxnet {
 namespace op {
@@ -32,6 +35,18 @@ Operator* CreateOp<cpu>(ConvolutionParam param, int dtype,
       break;
     }
   }
+#endif
+#if MXNET_USE_NNPACK == 1
+  if ((param.dilate[0] == 1 && param.dilate[1] == 1)
+      && param.kernel.ndim() == 2 && (!param.no_bias)
+      && param.num_group == 1) {
+    switch (dtype) {
+    case mshadow::kFloat32:
+      return new NNPACKConvolutionOp<float>(param);
+    default:
+      break;
+    }
+  }
 #endif
   MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
     op = new ConvolutionOp<cpu, DType>(param);
diff --git a/src/operator/nnpack/nnpack_convolution-inl.h b/src/operator/nnpack/nnpack_convolution-inl.h
new file mode 100644
index 000000000000..f1038bf216d5
--- /dev/null
+++ b/src/operator/nnpack/nnpack_convolution-inl.h
@@ -0,0 +1,117 @@
+/*!
+ * Copyright (c) 2016 by Contributors
+ * \file nnpack_convolution-inl.h
+ * \brief
+ * \author Carwin
+*/
+#ifndef MXNET_OPERATOR_NNPACK_NNPACK_CONVOLUTION_INL_H_
+#define MXNET_OPERATOR_NNPACK_NNPACK_CONVOLUTION_INL_H_
+
+#include <dmlc/logging.h>
+#include <dmlc/parameter.h>
+#include <mxnet/operator.h>
+#include <algorithm>
+#include <map>
+#include <vector>
+#include <string>
+#include <utility>
+#include "../convolution-inl.h"
+#include "nnpack.h"
+
+namespace mxnet {
+namespace op {
+
+class NNPACKInitialize {
+ public:
+  pthreadpool_t threadpool;
+
+ public:
+  NNPACKInitialize() {
+    nnp_status status = nnp_initialize();
+    if (nnp_status_success != status) {
+      LOG(FATAL) << "nnp_initialize failed status=" << status;
+    }
+    int num_threads = MXNET_USE_NNPACK_NUM_THREADS;
+    this->threadpool = pthreadpool_create(num_threads);
+  }
+  virtual ~NNPACKInitialize() {
+    nnp_status status = nnp_deinitialize();
+    if (nnp_status_success != status) {
+      LOG(FATAL) << "nnp_deinitialize failed status=" << status;
+    }
+    pthreadpool_destroy(threadpool);
+  }
+};
+
+static NNPACKInitialize nnpackinitialize;
+
+template<typename DType>
+class NNPACKConvolutionOp : public ConvolutionOp<cpu, DType> {
+ private:
+  ConvolutionParam param_;
+
+ public:
+  explicit NNPACKConvolutionOp(ConvolutionParam p)
+      : ConvolutionOp<cpu, DType>(p) {
+    this->param_ = p;
+  }
+
+ public:
+  virtual void Forward(const OpContext &ctx, const std::vector<TBlob> &in_data,
+                       const std::vector<OpReqType> &req,
+                       const std::vector<TBlob> &out_data,
+                       const std::vector<TBlob> &aux_args) {
+    using namespace mshadow;
+    using namespace mshadow::expr;
+    Stream<cpu> *s = ctx.get_stream<cpu>();
+    Tensor<cpu, 4, DType> data = in_data[conv::kData].get<cpu, 4, DType>(s);
+    Shape<3> wmat_shape =
+        Shape3(param_.num_group, param_.num_filter / param_.num_group,
+               data.shape_[1] / param_.num_group * param_.kernel[0] *
+                   param_.kernel[1]);
+    Tensor<cpu, 3, DType> wmat =
+        in_data[conv::kWeight].get_with_shape<cpu, 3, DType>(wmat_shape, s);
+    Tensor<cpu, 4, DType> out = out_data[conv::kOut].get<cpu, 4, DType>(s);
+
+    // nnp_convolution_inference is optimized for batch_size == 1;
+    // when W or H is less than 16, ConvolutionOp is faster than NNPACK.
+    if ((data.shape_[0] != 1) || (data.shape_[2] < 16) ||
+        (data.shape_[3] < 16)) {
+      ConvolutionOp<cpu, DType>::Forward(ctx, in_data, req, out_data, aux_args);
+    } else {
+      nnp_size input_size = {data.shape_[3], data.shape_[2]};
+      nnp_padding input_padding = {param_.pad[0], param_.pad[1], param_.pad[0],
+                                   param_.pad[1]};
+      nnp_size kernel_size = {param_.kernel[1], param_.kernel[0]};
+      nnp_size output_subsampling = {param_.stride[1], param_.stride[0]};
+      Tensor<cpu, 1, DType> bias = in_data[conv::kBias].get<cpu, 1, DType>(s);
+
+      nnp_convolution_algorithm algorithm = nnp_convolution_algorithm_auto;
+      if ((data.shape_[2] < 32) || (data.shape_[3] < 32)) {
+        algorithm = nnp_convolution_algorithm_implicit_gemm;
+      }
+
+      nnp_status status = nnp_convolution_inference(
+          algorithm,            // enum nnp_convolution_algorithm algorithm,
+          nnp_convolution_transform_strategy_tuple_based,
+          data.shape_[1],       // size_t input_channels,
+          param_.num_filter,    // size_t output_channels,
+          input_size,           // struct nnp_size input_size,
+          input_padding,        // struct nnp_padding input_padding,
+          kernel_size,          // struct nnp_size kernel_size,
+          output_subsampling,   // struct nnp_size output_subsampling,
+          data.dptr_,           // const float input[],
+          wmat.dptr_,           // const float kernel[],
+          bias.dptr_,           // const float bias[],
+          out.dptr_,            // float output[],
+          nnpackinitialize.threadpool,  // pthreadpool_t threadpool,
+          nullptr);
+      if (nnp_status_success != status) {
+        LOG(FATAL) << "nnp_convolution_inference failed status=" << status;
+      }
+    }
+  }
+};  // class NNPACKConvolutionOp
+}  // namespace op
+}  // namespace mxnet
+#endif  // MXNET_OPERATOR_NNPACK_NNPACK_CONVOLUTION_INL_H_
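
Usage note: the NNPACK path is only compiled in when the build is configured for it, and NNPACK itself (plus its pthreadpool dependency) has to be installed where the compiler and linker can find it, because the Makefile change above only adds -lnnpack. A minimal sketch of the make/config.mk settings involved (both variables come from this patch; any extra -I/-L flags for a non-standard NNPACK prefix are an assumption about the local environment, not something this patch sets up):

    # enable the NNPACK-backed convolution (CPU, float32, inference)
    USE_NNPACK = 1
    # worker threads for the pthreadpool created by the static NNPACKInitialize
    USE_NNPACK_NUM_THREADS = 4

With these set, the dispatch added in convolution.cc selects NNPACKConvolutionOp only for float32, 2-D kernels with bias, no dilation, and num_group == 1; at runtime the operator further falls back to the default ConvolutionOp unless batch_size == 1 and both H and W are at least 16.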