bolt v0.3.0 is released
jianfeifeng committed Jun 5, 2020
1 parent a193d96 commit 331a833
Showing 598 changed files with 40,759 additions and 13,573 deletions.
27 changes: 21 additions & 6 deletions CMakeLists.txt
@@ -1,4 +1,4 @@
cmake_minimum_required(VERSION 3.15)
cmake_minimum_required(VERSION 3.2)

file(GLOB BOLT_CONFIGURE_FILE $ENV{BOLT_ROOT}/bolt.cmake ${BOLT_ROOT}/bolt.cmake)
if (BOLT_CONFIGURE_FILE)
@@ -20,9 +20,11 @@ add_subdirectory(model-tools)
add_subdirectory(tensor_computing)
add_subdirectory(image)
add_subdirectory(inference)
add_subdirectory(tools)
add_subdirectory(kits)
add_subdirectory(tests)
add_custom_target(bolt_library ALL
COMMAND ./scripts/build_light_bolt.sh ${CMAKE_BINARY_DIR} ${USE_MALI}
COMMAND ./CI_SCRIPTS/build_light_bolt.sh ${CMAKE_BINARY_DIR} ${USE_MALI} ${USE_DEBUG} ${USE_LLVM_CLANG} ${CMAKE_CXX_COMPILER} ${CMAKE_AR} ${CMAKE_STRIP}
WORKING_DIRECTORY $ENV{BOLT_ROOT})

if (USE_MALI)
@@ -38,6 +40,7 @@ add_dependencies(tensor_computing blas-enhance)
add_dependencies(tensor_computing_static blas-enhance_static)
add_dependencies(inference tensor_computing model-tools image)
add_dependencies(inference_static tensor_computing_static model-tools_static image_static)
add_dependencies(bolt_library inference)
add_dependencies(bolt_library inference_static)

install(TARGETS blas-enhance blas-enhance_static
@@ -82,22 +85,34 @@ install(DIRECTORY model-tools/tools/tensorflow2caffe
model-tools/tools/pytorch2caffe
DESTINATION tools)

install(TARGETS tensor_computing_library_search
RUNTIME DESTINATION tools)
if (USE_LIBRARY_TUNING)
install(TARGETS tensor_computing_library_search
RUNTIME DESTINATION tools)
endif (USE_LIBRARY_TUNING)

if (BUILD_TEST)
if (USE_INT8)
install(TARGETS ptq_calibration
RUNTIME DESTINATION tools)
endif(USE_INT8)
install(TARGETS classification
bert
tinybert
nmt
RUNTIME DESTINATION bin)
asr_rnnt
asr_convolution_transformer
tts
vad
RUNTIME DESTINATION kits)
endif(BUILD_TEST)

install(DIRECTORY inference/exports/java
inference/exports/c
DESTINATION include)

install(FILES ${CMAKE_BINARY_DIR}/libBoltModel.so
${CMAKE_BINARY_DIR}/libbolt.a
${CMAKE_BINARY_DIR}/libbolt.so
DESTINATION lib)

execute_process(COMMAND doxygen .Doxyfile WORKING_DIRECTORY $ENV{BOLT_ROOT})
@@ -107,7 +122,7 @@ enable_testing()
find_program (BASH_PROGRAM bash)

if (BASH_PROGRAM)
set(parameters -b $ENV{BOLT_ROOT}/tests/bin -p /data/local/tmp/uldra)
set(parameters -t $ENV{BOLT_ROOT}/tests/bin -k $ENV{BOLT_ROOT}/kits/bin -p /data/local/tmp/uldra)
if (USE_MALI)
set(parameters ${parameters} -g)
endif(USE_MALI)
File renamed without changes.
48 changes: 31 additions & 17 deletions README.md
@@ -8,30 +8,39 @@ Bolt is a light-weight library for mobile devices. Bolt, as a universal deployme

- ### Overview

Bolt is highly optimized for ARMv8.2 CPUs, supporting fast inference of FP16, INT8 and BNN networks. Recently, FP32 functionality has been integrated, which also works on ARMv8 devices.
Bolt supports almost all ARM-A devices, including ARMv7/ARMv8/ARMv8.2/Mali-GPU. FP16/BNN on the CPU and FP16 on the GPU are highly optimized. Bolt also supports FP32 on ARMv7/ARMv8/ARMv8.2 devices.

Bolt has its own format of model storage, which helps reduce the memory footprint by storing in FP16 and 1-bit representations when possible. We provide model converters for the following formats:
Bolt has its own format of model storage, which helps reduce the memory footprint by storing in FP16, INT8 and 1-bit representations when possible. We provide model converters for the following formats:

- caffe
- onnx
- tflite

For PyTorch and TensorFlow models, please try to convert them to the onnx format first. We also had some success in converting these models into customized caffe models.
For PyTorch and TensorFlow models, please try to convert them to the onnx or tflite format first. We have also had some success converting such models into customized caffe models.

- ### Verified Networks

Bolt has shown its high performance in the inference of common CV and NLP neural networks. Some of the representative networks that we have verified are listed below. You can find detailed benchmark information in [docs/BENCHMARK.md](docs/BENCHMARK.md).

- Squeezenet (full-network int8 quantization)
- Mobilenet v1 - v3
- Squeezenet
- Mobilenet v1, v2, v3
- Resnet50, [Ghostnet](https://github.com/huawei-noah/ghostnet) (plus FPN detection)
- Birealnet18 (BNN)
- SSD(Resnet)
- Bert, TinyBert, Albert
- Neural Machine Translation
- Automatic Speech Recognition
- Text To Speech

Verified with FP16 support on MALI GPU:
- Squeezenet v1.1
- Mobilenet v1, v2, v3
- Ghostnet


- ### Inference Graph Optimizers

Apart from the refined acceleration of convolutions and GeMM for the supported data precisions, Bolt has a sophisticated inference graph optimizer. As shown in [model-tools/include](model-tools/include), classic operator fusion is supported. Bolt is also equipped with a Memory Reuse Optimizer, which reassigns the space occupied by a feature map as soon as it is no longer needed as input or output. Most networks that we tested benefit from a two-thirds reduction in feature map storage.
Apart from the refined acceleration of convolutions and GeMM for the supported data precisions, Bolt has an easy-to-use and powerful inference graph optimizer. As shown in [model-tools/include](model-tools/include), classic operator fusion is supported. Bolt is also equipped with a Memory Reuse Optimizer, which reassigns the space occupied by a feature map as soon as it is no longer needed as input or output. Most networks that we tested benefit from a two-thirds reduction in feature map storage.
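
A minimal sketch of the idea behind such a memory-reuse pass (not Bolt's actual implementation; the `Tensor` struct and `assignBuffers` helper below are illustrative): every feature map has a live interval over the topologically sorted operators, and a new tensor may take over a buffer whose previous occupant is already dead.

```cpp
#include <cstddef>
#include <vector>

struct Tensor {
    int firstUse;  // index of the operator that produces this feature map
    int lastUse;   // index of the last operator that reads it
};

// Greedy buffer assignment; tensors must be ordered by firstUse.
// Returns one shared-buffer index per tensor.
std::vector<int> assignBuffers(const std::vector<Tensor>& tensors) {
    std::vector<int> bufferFreeAfter;  // operator index after which each buffer is free
    std::vector<int> assignment(tensors.size(), -1);
    for (std::size_t i = 0; i < tensors.size(); ++i) {
        int chosen = -1;
        for (std::size_t b = 0; b < bufferFreeAfter.size(); ++b) {
            if (bufferFreeAfter[b] < tensors[i].firstUse) {  // previous occupant is dead
                chosen = static_cast<int>(b);
                break;
            }
        }
        if (chosen < 0) {  // no reusable buffer, allocate a new one
            chosen = static_cast<int>(bufferFreeAfter.size());
            bufferFreeAfter.push_back(-1);
        }
        bufferFreeAfter[chosen] = tensors[i].lastUse;
        assignment[i] = chosen;
    }
    return assignment;
}
```

In a real pass, each shared buffer would additionally be sized to its largest occupant, which is where the reported feature-map storage savings come from.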

- ### Thread Affinity Setting

@@ -93,11 +102,12 @@ We provide a detailed benchmark report for your reference. For more testing info

# Road Map

#### v0.3.0
#### v0.4.0

Future Release 2020-04-01
Future Release 2020-09-01

- GPU
- Yolo support
- TensorFlow model converter

# Who is using Bolt

@@ -106,27 +116,31 @@

# FAQ

1. More details about dependency libraries for cross-compilation?
1. Why does configuring bolt.cmake not take effect?

The [install.sh](install.sh) script serves as an example compilation setup, and it overwrites some settings in [bolt.cmake](bolt.cmake). Please check install.sh first.

2. More details about dependency libraries for cross-compilation?

The major dependency is Protobuf. protoc should be the x86 version, but the protobuf library should be the ARM version.

2. Requirements on tensor dimensions?
3. Requirements on tensor dimensions?

For optimal performance, Bolt requires the number of output channels to be divisible by 8. For compatibility, Bolt will try to pad the output channels of convolution layers to the nearest multiple of 8. You can turn on DEBUG in [bolt.cmake](bolt.cmake) to check the actual dimensions.
For optimal performance, Bolt requires the number of output channels to be divisible by 8. For compatibility, Bolt will try to pad the output channels of convolution layers to the nearest multiple of 8. You can turn on USE_DEBUG in [bolt.cmake](bolt.cmake) to check the actual dimensions.
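
As a quick illustration of the padding rule (a sketch; `padToMultipleOf8` is not part of Bolt's API):

```cpp
#include <cstdint>

// Round a channel count up to the next multiple of 8, e.g. 30 -> 32, 64 -> 64.
inline uint32_t padToMultipleOf8(uint32_t channels) {
    return (channels + 7) / 8 * 8;
}
```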

3. Restrictions for BNN?
4. Restrictions for BNN?

For BNN convolution layers, the number of output channels must be divisible by 32.

4. Restrictions on quantization (int8)?
5. Restrictions on quantization (int8)?

For the time being, Bolt only supports post-training int8 quantization. If quantization is activated, the second convolution layer will quantize the tensors to 8-bit integers. For now, int8 operators include Convolution, Pooling and Concatenation (end-to-end support for Squeezenet). If your network includes other operators, you may need to add type casting in the front of those operators. The quantization method is symmetrical for both activation and weight.
For the time being, Bolt only supports post-training int8 quantization. The quantization method is symmetric for both activations and weights. We have added a calibration tool for image CNN pipelines. Please feel free to report any usage failures.
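
For intuition, a minimal sketch of symmetric per-tensor int8 quantization (illustrative only; `quantizeSymmetricInt8` is a made-up helper, and Bolt's calibration tool is more involved):

```cpp
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <vector>

// Symmetric quantization: a single scale, zero-point fixed at 0,
// applied the same way to activations and weights.
void quantizeSymmetricInt8(const std::vector<float>& x,
                           std::vector<int8_t>* q, float* scale) {
    float maxAbs = 0.0f;
    for (float v : x) maxAbs = std::max(maxAbs, std::fabs(v));
    *scale = (maxAbs > 0.0f) ? maxAbs / 127.0f : 1.0f;  // real value ~= q * scale
    q->resize(x.size());
    for (std::size_t i = 0; i < x.size(); ++i) {
        float r = std::round(x[i] / *scale);
        (*q)[i] = static_cast<int8_t>(std::max(-127.0f, std::min(127.0f, r)));
    }
}
```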

5. Requirements for fp16 and int8?
6. Requirements for fp16 and int8?

Only ARMv8.2 supports the fp16 and int8 dot-product instructions.

6. Restrictions for MALI?
7. Restrictions for MALI?

MALI computing is only supported when compiling with the llvm (clang) toolchain.

File renamed without changes.
25 changes: 25 additions & 0 deletions blas-enhance/include/blas-enhance.h
@@ -36,6 +36,31 @@ extern "C" {
U32 bytes, void* tmp,
TensorDesc resultDesc, void* result, Arch arch);

// Packed data format expected for the right-hand (B) matrix of matrix_matrix_multiply, per data type.
inline DataFormat targetFormat4MatrixB(DataType dt)
{
switch (dt) {
case DT_F16: {
return DF_NKN24;
}
case DT_F32: {
#ifdef __aarch64__
return DF_NKN12;
#else
return DF_NKN8;
#endif
}
case DT_I8: {
return DF_NKN12K4;
}
default: {
CHECK_STATUS(NOT_SUPPORTED);
exit(1);
}
}
}

EE matrix_matrix_multiply_transform_rhs(TensorDesc desc, const void* src, TensorDesc* descTran, void* dst);
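
// A possible call sequence (sketch only; tensor2df() and DF_NORMAL are assumed to come
// from bolt's tensor_desc.h, and the variable names are illustrative):
//   TensorDesc bDesc = tensor2df(DT_F16, DF_NORMAL, K, N);  // original K x N matrix B
//   TensorDesc bPackedDesc;
//   CHECK_STATUS(matrix_matrix_multiply_transform_rhs(bDesc, matrixB, &bPackedDesc, packedB));
//   // bPackedDesc should then describe B in targetFormat4MatrixB(DT_F16), i.e. DF_NKN24.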

#ifdef __cplusplus
}
#endif
5 changes: 4 additions & 1 deletion blas-enhance/src/CMakeLists.txt
@@ -1,4 +1,7 @@
file(GLOB general_srcs ${CMAKE_CURRENT_SOURCE_DIR}/cpu/general/*.cpp)
if (USE_GENERAL)
file(GLOB general_srcs ${CMAKE_CURRENT_SOURCE_DIR}/cpu/general/*.cpp)
endif (USE_GENERAL)

if (USE_NEON)
if (USE_FP16)
file(GLOB arm_fp16_srcs ${CMAKE_CURRENT_SOURCE_DIR}/cpu/arm/fp16/*.cpp)
78 changes: 0 additions & 78 deletions blas-enhance/src/cpu/arm/arm_neon_expand.h

This file was deleted.

10 changes: 10 additions & 0 deletions blas-enhance/src/cpu/arm/blas_arm.h
@@ -15,6 +15,7 @@
#ifndef _H_BLAS_ARM
#define _H_BLAS_ARM

#include "error.h"
#include "sys.h"
#include "type.h"

@@ -37,4 +38,13 @@ EE mmm_arm(U32 matrixC_N, U32 matrixC_M, U32 matrixA_K,
void* matrixCData,
Arch arch);

// Round k up to the next multiple of 4 (the int8 kernels process K in groups of 4).
inline U32 pad_to_4_multiple(U32 k)
{
if (k % 4 == 0) {
return k;
} else {
return (k / 4) * 4 + 4;
}
}

#endif
5 changes: 5 additions & 0 deletions blas-enhance/src/cpu/arm/fp16/blas_fp16.h
@@ -18,12 +18,17 @@
#include "sys.h"
#include "type.h"
#include "error.h"
#include "tensor_desc.h"


EE mvm_fp16(U32 row, U32 col, bool transpose, F16* matrix, F16* vector, F16* result, Arch arch);

void matrix_matrix_multiply_tmp_bytes_fp16(U32 row1, U32 col1, U32 row2, U32 col2, DataType dt, U32 *bytes);

EE matrix_matrix_multiply_transform_rhsN_fp16(TensorDesc desc, F16* src, F16* dst);

EE matrix_matrix_multiply_transform_rhsT_fp16(TensorDesc desc, F16* src, F16* dst);

EE mmm_fp16(int M, int N, int K, F16* matrix1, F16* matrix2, F16* tmp, F16* result, Arch arch);

#endif
46 changes: 45 additions & 1 deletion blas-enhance/src/cpu/arm/fp16/mmm.cpp
@@ -16,15 +16,59 @@
#include "error.h"
#include "cpu/arm/fp16/blas_fp16.h"
#include "mmm.h"
#include "mmm_common.h"


void matrix_matrix_multiply_tmp_bytes_fp16(U32 row1, U32 col1, U32 row2, U32 col2, DataType dt, U32 *bytes)
{
*bytes = row1 * col1 + row2 * col2;
*bytes *= bytesOf (dt);
*bytes *= bytesOf(dt);
*bytes += 32;
}

// Pack a non-transposed (K x N) right-hand matrix into the blocked layout used by the fp16
// kernels: 24 columns at a time, then 8, then 4, then the remainder.
EE matrix_matrix_multiply_transform_rhsN_fp16(TensorDesc desc, F16* src, F16* dst)
{
DataType dt;
U32 N, K;
CHECK_STATUS(tensor2dGet(desc, &dt, &K, &N));
int i = 0;
for (; i < (int)N - 23; i += 24) {
matrix2_trans(24, K, N, src + i, dst + i * K);
}
for (; i < (int)N - 7; i += 8) {
matrix2_trans(8, K, N, src + i, dst + i * K);
}
for (; i < (int)N - 3; i += 4) {
matrix2_trans(4, K, N, src + i, dst + i * K);
}
if ((int)N > i) {
matrix2_trans(N - i, K, N, src + i, dst + i * K);
}
return SUCCESS;
}

// Same blocking for a transposed (N x K) right-hand matrix, reading rows via matrix1_trans.
EE matrix_matrix_multiply_transform_rhsT_fp16(TensorDesc desc, F16* src, F16* dst)
{
DataType dt;
U32 N, K;
CHECK_STATUS(tensor2dGet(desc, &dt, &N, &K));
int i = 0;
for (; i < (int)N - 23; i += 24) {
matrix1_trans(24, K, K, src + i * K, dst + i * K);
}
for (; i < (int)N - 7; i += 8) {
matrix1_trans(8, K, K, src + i * K, dst + i * K);
}
for (; i < (int)N - 3; i += 4) {
matrix1_trans(4, K, K, src + i * K, dst + i * K);
}
if ((int)N > i) {
matrix1_trans(N - i, K, K, src + i * K, dst + i * K);
}
return SUCCESS;
}


EE mmm_fp16(int M, int N, int K, F16* matrix1, F16* matrix2, F16* tmp, F16* result, Arch arch)
{
EE ret = SUCCESS;