bolt v0.3.0 is released
jianfeifeng committed Jun 5, 2020
1 parent a193d96 commit 331a833
Showing 598 changed files with 40,759 additions and 13,573 deletions.
27 changes: 21 additions & 6 deletions CMakeLists.txt
@@ -1,4 +1,4 @@
cmake_minimum_required(VERSION 3.15)
cmake_minimum_required(VERSION 3.2)

file(GLOB BOLT_CONFIGURE_FILE $ENV{BOLT_ROOT}/bolt.cmake ${BOLT_ROOT}/bolt.cmake)
if (BOLT_CONFIGURE_FILE)
@@ -20,9 +20,11 @@ add_subdirectory(model-tools)
add_subdirectory(tensor_computing)
add_subdirectory(image)
add_subdirectory(inference)
add_subdirectory(tools)
add_subdirectory(kits)
add_subdirectory(tests)
add_custom_target(bolt_library ALL
COMMAND ./scripts/build_light_bolt.sh ${CMAKE_BINARY_DIR} ${USE_MALI}
COMMAND ./CI_SCRIPTS/build_light_bolt.sh ${CMAKE_BINARY_DIR} ${USE_MALI} ${USE_DEBUG} ${USE_LLVM_CLANG} ${CMAKE_CXX_COMPILER} ${CMAKE_AR} ${CMAKE_STRIP}
WORKING_DIRECTORY $ENV{BOLT_ROOT})

if (USE_MALI)
@@ -38,6 +40,7 @@ add_dependencies(tensor_computing blas-enhance)
add_dependencies(tensor_computing_static blas-enhance_static)
add_dependencies(inference tensor_computing model-tools image)
add_dependencies(inference_static tensor_computing_static model-tools_static image_static)
add_dependencies(bolt_library inference)
add_dependencies(bolt_library inference_static)

install(TARGETS blas-enhance blas-enhance_static
@@ -82,22 +85,34 @@ install(DIRECTORY model-tools/tools/tensorflow2caffe
model-tools/tools/pytorch2caffe
DESTINATION tools)

install(TARGETS tensor_computing_library_search
RUNTIME DESTINATION tools)
if (USE_LIBRARY_TUNING)
install(TARGETS tensor_computing_library_search
RUNTIME DESTINATION tools)
endif (USE_LIBRARY_TUNING)

if (BUILD_TEST)
if (USE_INT8)
install(TARGETS ptq_calibration
RUNTIME DESTINATION tools)
endif(USE_INT8)
install(TARGETS classification
bert
tinybert
nmt
RUNTIME DESTINATION bin)
asr_rnnt
asr_convolution_transformer
tts
vad
RUNTIME DESTINATION kits)
endif(BUILD_TEST)

install(DIRECTORY inference/exports/java
inference/exports/c
DESTINATION include)

install(FILES ${CMAKE_BINARY_DIR}/libBoltModel.so
${CMAKE_BINARY_DIR}/libbolt.a
${CMAKE_BINARY_DIR}/libbolt.so
DESTINATION lib)

execute_process(COMMAND doxygen .Doxyfile WORKING_DIRECTORY $ENV{BOLT_ROOT})
@@ -107,7 +122,7 @@ enable_testing()
find_program (BASH_PROGRAM bash)

if (BASH_PROGRAM)
set(parameters -b $ENV{BOLT_ROOT}/tests/bin -p /data/local/tmp/uldra)
set(parameters -t $ENV{BOLT_ROOT}/tests/bin -k $ENV{BOLT_ROOT}/kits/bin -p /data/local/tmp/uldra)
if (USE_MALI)
set(parameters ${parameters} -g)
endif(USE_MALI)
File renamed without changes.
48 changes: 31 additions & 17 deletions README.md
@@ -8,30 +8,39 @@ Bolt is a light-weight library for mobile devices. Bolt, as a universal deployme

- ### Overview

Bolt is highly optimized for ARMv8.2 CPUs, supporting fast inference of FP16, INT8 and BNN networks. Recently, FP32 functionality has been integrated, which also works on ARMv8 devices.
Bolt supports almost all ARM-A devices, including ARMv7/ARMv8/ARMv8.2/Mali-GPU. FP16/BNN on the CPU and FP16 on the GPU are highly optimized. Bolt also supports FP32 on ARMv7/ARMv8/ARMv8.2 devices.

Bolt has its own format of model storage, which helps reduce the memory footprint by storing in FP16 and 1-bit representations when possible. We provide model converters for the following formats:
Bolt has its own format of model storage, which helps reduce the memory footprint by storing in FP16, INT8 and 1-bit representations when possible. We provide model converters for the following formats:

- caffe
- onnx
- tflite

For PyTorch and TensorFlow models, please try to convert them to the onnx format first. We also had some success in converting these models into customized caffe models.
For PyTorch and TensorFlow models, please try to convert them to the onnx or tflite format first. We have also had some success converting such models into customized caffe models.

- ### Verified Networks

Bolt has shown its high performance in the inference of common CV and NLP neural networks. Some of the representative networks that we have verified are listed below. You can find detailed benchmark information in [docs/BENCHMARK.md](docs/BENCHMARK.md).

- Squeezenet (full-network int8 quantization)
- Mobilenet v1 - v3
- Squeezenet
- Mobilenet v1, v2, v3
- Resnet50, [Ghostnet](https://github.com/huawei-noah/ghostnet) (plus FPN detection)
- Birealnet18 (BNN)
- SSD(Resnet)
- Bert, TinyBert, Albert
- Neural Machine Translation
- Automatic Speech Recognition
- Text To Speech

Verified with FP16 support on MALI GPU:
- Squeezenet v1.1
- Mobilenet v1, v2, v3
- Ghostnet


- ### Inference Graph Optimizers

Apart from the refined acceleration of convolutions and GeMM for the supported data precisions, Bolt has a sophisticated inference graph optimizer. As shown in [model-tools/include](model-tools/include), classic operator fusion is supported. Bolt is also equipped with a Memory Reuse Optimizer, which reassigns the space occupied by a feature map as soon as it is no longer needed as input or output. Most networks that we tested benefit from a two-thirds reduction in feature map storage.
Apart from the refined acceleration of convolutions and GeMM for the supported data precisions, Bolt has an easy-to-use and powerful inference graph optimizer. As shown in [model-tools/include](model-tools/include), classic operator fusion is supported. Bolt is also equipped with a Memory Reuse Optimizer, which reassigns the space occupied by a feature map as soon as it is no longer needed as input or output. Most networks that we tested benefit from a two-thirds reduction in feature map storage.
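
A minimal sketch of the idea behind such a memory-reuse pass (not Bolt's actual implementation; the `Tensor` struct and `assignBuffers` helper below are illustrative): every feature map has a live interval over the topologically sorted operators, and a new tensor may take over a buffer whose previous occupant is already dead.

```cpp
#include <cstddef>
#include <vector>

struct Tensor {
    int firstUse;  // index of the operator that produces this feature map
    int lastUse;   // index of the last operator that reads it
};

// Greedy buffer assignment; tensors must be ordered by firstUse.
// Returns one shared-buffer index per tensor.
std::vector<int> assignBuffers(const std::vector<Tensor>& tensors) {
    std::vector<int> bufferFreeAfter;  // operator index after which each buffer is free
    std::vector<int> assignment(tensors.size(), -1);
    for (std::size_t i = 0; i < tensors.size(); ++i) {
        int chosen = -1;
        for (std::size_t b = 0; b < bufferFreeAfter.size(); ++b) {
            if (bufferFreeAfter[b] < tensors[i].firstUse) {  // previous occupant is dead
                chosen = static_cast<int>(b);
                break;
            }
        }
        if (chosen < 0) {  // no reusable buffer, allocate a new one
            chosen = static_cast<int>(bufferFreeAfter.size());
            bufferFreeAfter.push_back(-1);
        }
        bufferFreeAfter[chosen] = tensors[i].lastUse;
        assignment[i] = chosen;
    }
    return assignment;
}
```

In a real pass, each shared buffer would additionally be sized to its largest occupant, which is where the reported feature-map storage savings come from.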

- ### Thread Affinity Setting

@@ -93,11 +102,12 @@ We provide a detailed benchmark report for your reference. For more testing info

# Road Map

#### v0.3.0
#### v0.4.0

Future Release 2020-04-01
Future Release 2020-09-01

- GPU
- Yolo support
- TensorFlow model converter

# Who is using Bolt

@@ -106,27 +116,31 @@

# FAQ

1. More details about dependency libraries for cross-compilation?
1. Why does configuring bolt.cmake not take effect?

The [install.sh](install.sh) script serves as an example compilation setup, and it overwrites some settings in [bolt.cmake](bolt.cmake). Please check install.sh first.

2. More details about dependency libraries for cross-compilation?

The major dependency is Protobuf. protoc should be the x86 version, but the protobuf library should be the ARM version.

2. Requirements on tensor dimensions?
3. Requirements on tensor dimensions?

For optimal performance, Bolt requires the number of output channels to be divisible by 8. For compatibility, Bolt will try to pad the output channels of convolution layers to the nearest multiple of 8. You can turn on DEBUG in [bolt.cmake](bolt.cmake) to check the actual dimensions.
For optimal performance, Bolt requires the number of output channels to be divisible by 8. For compatibility, Bolt will try to pad the output channels of convolution layers to the nearest multiple of 8. You can turn on USE_DEBUG in [bolt.cmake](bolt.cmake) to check the actual dimensions.
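
As a quick illustration of the padding rule (a sketch; `padToMultipleOf8` is not part of Bolt's API):

```cpp
#include <cstdint>

// Round a channel count up to the next multiple of 8, e.g. 30 -> 32, 64 -> 64.
inline uint32_t padToMultipleOf8(uint32_t channels) {
    return (channels + 7) / 8 * 8;
}
```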

3. Restrictions for BNN?
4. Restrictions for BNN?

For BNN convolution layers, the number of output channels must be divisible by 32.

4. Restrictions on quantization (int8)?
5. Restrictions on quantization (int8)?

For the time being, Bolt only supports post-training int8 quantization. If quantization is activated, the second convolution layer will quantize the tensors to 8-bit integers. For now, int8 operators include Convolution, Pooling and Concatenation (end-to-end support for Squeezenet). If your network includes other operators, you may need to add type casting in the front of those operators. The quantization method is symmetrical for both activation and weight.
For the time being, Bolt only supports post-training int8 quantization. The quantization method is symmetric for both activations and weights. We have added a calibration tool for image CNN pipelines. Please feel free to report any usage failures.
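
For intuition, a minimal sketch of symmetric per-tensor int8 quantization (illustrative only; `quantizeSymmetricInt8` is a made-up helper, and Bolt's calibration tool is more involved):

```cpp
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <vector>

// Symmetric quantization: a single scale, zero-point fixed at 0,
// applied the same way to activations and weights.
void quantizeSymmetricInt8(const std::vector<float>& x,
                           std::vector<int8_t>* q, float* scale) {
    float maxAbs = 0.0f;
    for (float v : x) maxAbs = std::max(maxAbs, std::fabs(v));
    *scale = (maxAbs > 0.0f) ? maxAbs / 127.0f : 1.0f;  // real value ~= q * scale
    q->resize(x.size());
    for (std::size_t i = 0; i < x.size(); ++i) {
        float r = std::round(x[i] / *scale);
        (*q)[i] = static_cast<int8_t>(std::max(-127.0f, std::min(127.0f, r)));
    }
}
```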

5. Requirements for fp16 and int8?
6. Requirements for fp16 and int8?

Only ARMv8.2 supports the fp16 and int8 dot-product instructions.

6. Restrictions for MALI?
7. Restrictions for MALI?

MALI computing is only supported when compiling with the llvm (clang) toolchain.

File renamed without changes.
25 changes: 25 additions & 0 deletions blas-enhance/include/blas-enhance.h
@@ -36,6 +36,31 @@ extern "C" {
U32 bytes, void* tmp,
TensorDesc resultDesc, void* result, Arch arch);

// Packed data format expected for the right-hand (B) matrix of matrix_matrix_multiply, per data type.
inline DataFormat targetFormat4MatrixB(DataType dt)
{
switch (dt) {
case DT_F16: {
return DF_NKN24;
}
case DT_F32: {
#ifdef __aarch64__
return DF_NKN12;
#else
return DF_NKN8;
#endif
}
case DT_I8: {
return DF_NKN12K4;
}
default: {
CHECK_STATUS(NOT_SUPPORTED);
exit(1);
}
}
}

EE matrix_matrix_multiply_transform_rhs(TensorDesc desc, const void* src, TensorDesc* descTran, void* dst);
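
// A possible call sequence (sketch only; tensor2df() and DF_NORMAL are assumed to come
// from bolt's tensor_desc.h, and the variable names are illustrative):
//   TensorDesc bDesc = tensor2df(DT_F16, DF_NORMAL, K, N);  // original K x N matrix B
//   TensorDesc bPackedDesc;
//   CHECK_STATUS(matrix_matrix_multiply_transform_rhs(bDesc, matrixB, &bPackedDesc, packedB));
//   // bPackedDesc should then describe B in targetFormat4MatrixB(DT_F16), i.e. DF_NKN24.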

#ifdef __cplusplus
}
#endif
5 changes: 4 additions & 1 deletion blas-enhance/src/CMakeLists.txt
@@ -1,4 +1,7 @@
file(GLOB general_srcs ${CMAKE_CURRENT_SOURCE_DIR}/cpu/general/*.cpp)
if (USE_GENERAL)
file(GLOB general_srcs ${CMAKE_CURRENT_SOURCE_DIR}/cpu/general/*.cpp)
endif (USE_GENERAL)

if (USE_NEON)
if (USE_FP16)
file(GLOB arm_fp16_srcs ${CMAKE_CURRENT_SOURCE_DIR}/cpu/arm/fp16/*.cpp)
78 changes: 0 additions & 78 deletions blas-enhance/src/cpu/arm/arm_neon_expand.h

This file was deleted.

10 changes: 10 additions & 0 deletions blas-enhance/src/cpu/arm/blas_arm.h
@@ -15,6 +15,7 @@
#ifndef _H_BLAS_ARM
#define _H_BLAS_ARM

#include "error.h"
#include "sys.h"
#include "type.h"

@@ -37,4 +38,13 @@ EE mmm_arm(U32 matrixC_N, U32 matrixC_M, U32 matrixA_K,
void* matrixCData,
Arch arch);

// Round k up to the next multiple of 4 (the int8 kernels process K in groups of 4).
inline U32 pad_to_4_multiple(U32 k)
{
if (k % 4 == 0) {
return k;
} else {
return (k / 4) * 4 + 4;
}
}

#endif
5 changes: 5 additions & 0 deletions blas-enhance/src/cpu/arm/fp16/blas_fp16.h
@@ -18,12 +18,17 @@
#include "sys.h"
#include "type.h"
#include "error.h"
#include "tensor_desc.h"


EE mvm_fp16(U32 row, U32 col, bool transpose, F16* matrix, F16* vector, F16* result, Arch arch);

void matrix_matrix_multiply_tmp_bytes_fp16(U32 row1, U32 col1, U32 row2, U32 col2, DataType dt, U32 *bytes);

EE matrix_matrix_multiply_transform_rhsN_fp16(TensorDesc desc, F16* src, F16* dst);

EE matrix_matrix_multiply_transform_rhsT_fp16(TensorDesc desc, F16* src, F16* dst);

EE mmm_fp16(int M, int N, int K, F16* matrix1, F16* matrix2, F16* tmp, F16* result, Arch arch);

#endif
46 changes: 45 additions & 1 deletion blas-enhance/src/cpu/arm/fp16/mmm.cpp
@@ -16,15 +16,59 @@
#include "error.h"
#include "cpu/arm/fp16/blas_fp16.h"
#include "mmm.h"
#include "mmm_common.h"


void matrix_matrix_multiply_tmp_bytes_fp16(U32 row1, U32 col1, U32 row2, U32 col2, DataType dt, U32 *bytes)
{
*bytes = row1 * col1 + row2 * col2;
*bytes *= bytesOf (dt);
*bytes *= bytesOf(dt);
*bytes += 32;
}

// Pack a non-transposed (K x N) right-hand matrix into the blocked layout used by the fp16
// kernels: 24 columns at a time, then 8, then 4, then the remainder.
EE matrix_matrix_multiply_transform_rhsN_fp16(TensorDesc desc, F16* src, F16* dst)
{
DataType dt;
U32 N, K;
CHECK_STATUS(tensor2dGet(desc, &dt, &K, &N));
int i = 0;
for (; i < (int)N - 23; i += 24) {
matrix2_trans(24, K, N, src + i, dst + i * K);
}
for (; i < (int)N - 7; i += 8) {
matrix2_trans(8, K, N, src + i, dst + i * K);
}
for (; i < (int)N - 3; i += 4) {
matrix2_trans(4, K, N, src + i, dst + i * K);
}
if ((int)N > i) {
matrix2_trans(N - i, K, N, src + i, dst + i * K);
}
return SUCCESS;
}

// Same blocking for a transposed (N x K) right-hand matrix, reading rows via matrix1_trans.
EE matrix_matrix_multiply_transform_rhsT_fp16(TensorDesc desc, F16* src, F16* dst)
{
DataType dt;
U32 N, K;
CHECK_STATUS(tensor2dGet(desc, &dt, &N, &K));
int i = 0;
for (; i < (int)N - 23; i += 24) {
matrix1_trans(24, K, K, src + i * K, dst + i * K);
}
for (; i < (int)N - 7; i += 8) {
matrix1_trans(8, K, K, src + i * K, dst + i * K);
}
for (; i < (int)N - 3; i += 4) {
matrix1_trans(4, K, K, src + i * K, dst + i * K);
}
if ((int)N > i) {
matrix1_trans(N - i, K, K, src + i * K, dst + i * K);
}
return SUCCESS;
}


EE mmm_fp16(int M, int N, int K, F16* matrix1, F16* matrix2, F16* tmp, F16* result, Arch arch)
{
EE ret = SUCCESS;